# Preparing Dataset

## Reading DataFrames from multiple files

In [7]:
# import pandas
import pandas as pd

# read 'Bronze.csv' into a DataFrame: bronze
bronze = pd.read_csv("Bronze.csv")

# Read 'Silver.csv' into a DataFrame: silver
silver = pd.read_csv('Silver.csv')

# Read 'Gold.csv' into a DataFrame: gold
gold = pd.read_csv("Gold.csv")

# Print the first five rows of gold
print(gold.head())

   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0


## Reading DataFrames from multiple files in a loop

In [13]:
# Import pandas
import pandas as pd

# Create the list of file names: filenames
filenames = ['Gold.csv', 'Silver.csv', 'Bronze.csv']

# Create the list of three DataFrames: dataframes
datframes = []
for filename in filenames:
    dataframes.append(pd.read_csv(filename))

# Print top 5 rows of 1st DataFrame in dataframes
print(dataframes[0].head())

   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0


## Combining DataFrames from multiple data files

In [15]:
# Import pandas
import pandas as pd

# Make a copy of gold: medals
medals = gold.copy()

# Create list of new column labels: new_labels
new_labels = ['NOC', 'Country', 'Gold']

# Rename the columns of medals using new_labels
medals.columns = new_labels

# Add columns 'Silver' & 'Bronze' to medals
medals['Silver'] = silver['Total']
medals['Bronze'] = bronze['Total']

# Print the head of medals
print(medals.head())

   NOC         Country    Gold  Silver  Bronze
0  USA   United States  2088.0  1195.0  1052.0
1  URS    Soviet Union   838.0   627.0   584.0
2  GBR  United Kingdom   498.0   591.0   505.0
3  FRA          France   378.0   461.0   475.0
4  GER         Germany   407.0   350.0   454.0


## Sorting DataFrame with the Index & columns

In [32]:
# Import pandas
import pandas as pd

# Read 'monthly_max_temp.csv' into a DataFrame: weather1
weather1 = pd.read_csv('monthly_max_temp.csv', index_col='month')

# Print the head of weather1
print(weather1.head())

       record_id  day  year  AverageTemperatureFahr  \
month                                                 
1         474376    1  1853                     NaN   
2         474377    1  1853                     NaN   
3         474378    1  1853                     NaN   
4         474379    1  1853                     NaN   
5         474380    1  1853                     NaN   

       AverageTemperatureUncertaintyFahr      City country_id      Country  \
month                                                                        
1                                    NaN  Auckland        NEW  New Zealand   
2                                    NaN  Auckland        NEW  New Zealand   
3                                    NaN  Auckland        NEW  New Zealand   
4                                    NaN  Auckland        NEW  New Zealand   
5                                    NaN  Auckland        NEW  New Zealand   

      Latitude Longitude  
month                     
1       36.17

In [33]:
# Sort the index of weather1 in alphabetical order: weather2
weather2 = weather1.sort_index()

# Print the head of weather2
print(weather2.head())

       record_id  day  year  AverageTemperatureFahr  \
month                                                 
1         474376    1  1853                     NaN   
1        5412831    1  1761                 29.3288   
1        1346967    1  1843                 75.4772   
1        5412843    1  1762                 39.0758   
1        5412855    1  1763                 28.8176   

       AverageTemperatureUncertaintyFahr      City country_id      Country  \
month                                                                        
1                                    NaN  Auckland        NEW  New Zealand   
1                                41.2880     Odesa        UKR      Ukraine   
1                                34.8566    Canoas        BRA       Brazil   
1                                37.1462     Odesa        UKR      Ukraine   
1                                41.2178     Odesa        UKR      Ukraine   

      Latitude Longitude  
month                     
1       36.17

In [34]:
# Sort the index of weather1 in reverse alphabetical order: weather3
weather3 = weather1.sort_index(ascending=False)

# Print the head of weather3
print(weather3.head())

       record_id  day  year  AverageTemperatureFahr  \
month                                                 
12       5413598    1  1824                 44.6720   
12       3503394    1  2008                 68.5814   
12       3503310    1  2001                 66.0164   
12       1347782    1  1910                 71.3660   
12       5700481    1  1843                 40.2512   

       AverageTemperatureUncertaintyFahr          City country_id  \
month                                                               
12                               39.4412         Odesa        UKR   
12                               32.4842  Johannesburg        SOU   
12                               32.7416  Johannesburg        SOU   
12                               34.1690           NaN        NaN   
12                               37.2704         Paris        FRA   

            Country Latitude Longitude  
month                                   
12          Ukraine   45.81N    31.15E  
12     

In [35]:
# Sort weather1 numerically using the values of 'Max TemperatureF': weather4
weather4 = weather1.sort_values('AverageTemperatureFahr')

# Print the head of weather4
print(weather4.head())

       record_id  day  year  AverageTemperatureFahr  \
month                                                 
2        3819490    1  1929                  2.8562   
1        3819645    1  1942                  3.5456   
1        3820185    1  1987                  4.5050   
1        3819897    1  1963                  4.8344   
1        3819057    1  1893                  5.0378   

       AverageTemperatureUncertaintyFahr  City country_id  Country Latitude  \
month                                                                         
2                                33.0872  Kiev        UKR  Ukraine   50.63N   
1                                33.7946  Kiev        UKR  Ukraine   50.63N   
1                                32.7956  Kiev        UKR  Ukraine   50.63N   
1                                33.6092  Kiev        UKR  Ukraine   50.63N   
1                                33.0386  Kiev        UKR  Ukraine   50.63N   

      Longitude  
month            
2        31.69E  
1     

## Reindexing DataFrame from a list

In [57]:
# Import pandas
import pandas as pd

# Reindex weather1 using the list year: weather2
weather2 = weather1.reset_index()
weather2 = weather2.set_index('year')

# Print weather2
print(weather2)

# Reindex weather1 using the list year with forward-fill: weather3
weather3 = weather1.reindex(year).ffill()

# Print weather3
print(weather3)

      month  record_id  day  AverageTemperatureFahr  \
year                                                  
1853      1     474376    1                     NaN   
1853      2     474377    1                     NaN   
1853      3     474378    1                     NaN   
1853      4     474379    1                     NaN   
1853      5     474380    1                     NaN   
1853      6     474381    1                 51.9062   
1853      7     474382    1                 52.3886   
1853      8     474383    1                 52.8530   
1853      9     474384    1                 52.5776   
1853     10     474385    1                 54.8726   
1853     11     474386    1                 56.6888   
1853     12     474387    1                 59.8460   
1854      1     474388    1                 64.5908   
1854      2     474389    1                 65.3720   
1854      3     474390    1                 64.9688   
1854      4     474391    1                 59.9270   
1854      

NameError: name 'year' is not defined

## Reindexing using another DataFrame Index

In [61]:
names_1981 = pd.read_csv('names1981.csv', header=None, names=['name','gender','count'], index_col=(0,1))

In [70]:
# Import pandas
import pandas as pd

# Reindex names_1981 with index of names_1881: common_names
common_names = names_1981.reindex(names_1981.index)

# Print shape of common_names
print(common_names.shape)

(19455, 1)


In [71]:
# Drop rows with null counts: common_names
common_names = common_names.dropna()

# Print shape of new common_names
print(common_names.shape)

(19455, 1)


## Broadcasting in arithmetic formulas

In [72]:
# Extract selected columns from weather as new DataFrame: temps_f
temps_f = weather1[['Min TemperatureF', 'Mean TemperatureF', 'Max TemperatureF']]

# Convert temps_f to celsius: temps_c
temps_c = (temps_f - 32) * 5/9

# Rename 'F' in column names with 'C': temps_c.columns
temps_c.columns = temps_c.columns.str.replace('F', 'C')

# Print first 5 rows of temps_c
print(temps_c.head())

KeyError: "['Min TemperatureF' 'Mean TemperatureF' 'Max TemperatureF'] not in index"

## Computing percentage growth of GDP

In [76]:
import pandas as pd

# Read 'GDP.csv' into a DataFrame: gdp
gdp = pd.read_csv('gdp_usa.csv', parse_dates=True, index_col='DATE')
gdp

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
1947-01-01,243.1
1947-04-01,246.3
1947-07-01,250.1
1947-10-01,260.3
1948-01-01,266.2
1948-04-01,272.9
1948-07-01,279.5
1948-10-01,280.7
1949-01-01,275.4
1949-04-01,271.7


In [78]:
# Slice all the gdp data from 2008 onward: post2008
post2008 = gdp.loc['2008':]

# Print the last 8 rows of post2008
print(post2008.tail(8))

              VALUE
DATE               
2014-07-01  17569.4
2014-10-01  17692.2
2015-01-01  17783.6
2015-04-01  17998.3
2015-07-01  18141.9
2015-10-01  18222.8
2016-01-01  18281.6
2016-04-01  18436.5


In [80]:
# Resample post2008 by year, keeping last(): yearly
yearly = post2008.resample('A').last()

# Print yearly
print(yearly)

              VALUE
DATE               
2008-12-31  14549.9
2009-12-31  14566.5
2010-12-31  15230.2
2011-12-31  15785.3
2012-12-31  16297.3
2013-12-31  16999.9
2014-12-31  17692.2
2015-12-31  18222.8
2016-12-31  18436.5


In [81]:
# Compute percentage growth of yearly: yearly['growth']
yearly['growth'] = yearly.pct_change() * 100

# Print yearly again
print(yearly)

              VALUE    growth
DATE                         
2008-12-31  14549.9       NaN
2009-12-31  14566.5  0.114090
2010-12-31  15230.2  4.556345
2011-12-31  15785.3  3.644732
2012-12-31  16297.3  3.243524
2013-12-31  16999.9  4.311144
2014-12-31  17692.2  4.072377
2015-12-31  18222.8  2.999062
2016-12-31  18436.5  1.172707


## Converting currency of stocks

In [83]:
# Import pandas
import pandas as pd

# Read 'sp500.csv' into a DataFrame: sp500
sp500 = pd.read_csv('sp500.csv', parse_dates=True, index_col='Date')

# Read 'exchange.csv' into a DataFrame: exchange
exchange = pd.read_csv('exchange.csv', parse_dates=True, index_col='Date')

# print the head of 'sp500' and 'exchange'
print(sp500.head())
print(exchange.head())

                   Open         High          Low        Close      Volume  \
Date                                                                         
2015-01-02  2058.899902  2072.360107  2046.040039  2058.199951  2708700000   
2015-01-05  2054.439941  2054.439941  2017.339966  2020.579956  3799120000   
2015-01-06  2022.150024  2030.250000  1992.439941  2002.609985  4460110000   
2015-01-07  2005.550049  2029.609985  2005.550049  2025.900024  3805480000   
2015-01-08  2030.609985  2064.080078  2030.609985  2062.139893  3934010000   

              Adj Close  
Date                     
2015-01-02  2058.199951  
2015-01-05  2020.579956  
2015-01-06  2002.609985  
2015-01-07  2025.900024  
2015-01-08  2062.139893  
            GBP/USD
Date               
2015-01-02  0.65101
2015-01-05  0.65644
2015-01-06  0.65896
2015-01-07  0.66344
2015-01-08  0.66151


In [85]:
# Subset 'Open' & 'Close' columns from sp500: dollars
dollars = sp500[['Open', 'Close']]

# Print the head of dollars
print(dollars.head())

                   Open        Close
Date                                
2015-01-02  2058.899902  2058.199951
2015-01-05  2054.439941  2020.579956
2015-01-06  2022.150024  2002.609985
2015-01-07  2005.550049  2025.900024
2015-01-08  2030.609985  2062.139893


In [86]:
# Convert dollars to pounds: pounds
pounds = dollars.multiply(exchange['GBP/USD'],axis='rows')

# Print the head of pounds
print(pounds.head())

                   Open        Close
Date                                
2015-01-02  1340.364425  1339.908750
2015-01-05  1348.616555  1326.389506
2015-01-06  1332.515980  1319.639876
2015-01-07  1330.562125  1344.063112
2015-01-08  1343.268811  1364.126161
