In [1]:
#dependencies

import pandas as pd

In [2]:
# Pages to scrape: usa.com state rankings by average minimum and average
# maximum temperature (both URLs carry the same query string).

url_min, url_max = (
    'http://www.usa.com/rank/us--average-min-temperature--state-rank.htm?yr=9000&dis=&wist=&plow=&phigh=',
    'http://www.usa.com/rank/us--average-max-temperature--state-rank.htm?yr=9000&dis=&wist=&plow=&phigh=',
)

In [3]:
# Fetch the min-temperature ranking page and let pandas parse its HTML tables.
# The result is a list (the recorded output opens with "["); element 0 is the
# ranking table, whose first row still holds the header text.
# NOTE(review): this hits the network on every run, and echoing the whole list
# prints the full table - consider previewing only the first few rows.

table_min = pd.read_html(url_min)
table_min

[       0                           1                               2
 0   Rank  Average Min. Temperature ▼              State / Population
 1     1.                     65.09°F              Hawaii / 1,392,704
 2     2.                     61.50°F            Florida / 19,361,792
 3     3.                     56.07°F           Louisiana / 4,601,049
 4     4.                     54.16°F              Texas / 26,092,033
 5     5.                     52.26°F         Mississippi / 2,984,345
 6     6.                     51.31°F             Arizona / 6,561,516
 7     7.                     51.01°F             Alabama / 4,817,678
 8     8.                     50.69°F             Georgia / 9,907,756
 9     9.                     49.82°F      South Carolina / 4,727,273
 10   10.                     48.97°F         California / 38,066,920
 11   11.                     48.72°F            Arkansas / 2,947,036
 12   12.                     48.17°F            Oklahoma / 3,818,851
 13   13.           

In [4]:
# Fetch the max-temperature ranking page and let pandas parse its HTML tables.
# The result is a list (the recorded output opens with "["); element 0 is the
# ranking table, whose first row still holds the header text.
# NOTE(review): this hits the network on every run, and echoing the whole list
# prints the full table - consider previewing only the first few rows.
table_max = pd.read_html(url_max)
table_max

[       0                           1                               2
 0   Rank  Average Max. Temperature ▼              State / Population
 1     1.                     81.97°F            Florida / 19,361,792
 2     2.                     80.66°F              Hawaii / 1,392,704
 3     3.                     80.59°F             Arizona / 6,561,516
 4     4.                     77.73°F              Texas / 26,092,033
 5     5.                     77.33°F           Louisiana / 4,601,049
 6     6.                     74.67°F         Mississippi / 2,984,345
 7     7.                     74.31°F             Alabama / 4,817,678
 8     8.                     73.90°F             Georgia / 9,907,756
 9     9.                     73.58°F      South Carolina / 4,727,273
 10   10.                     73.36°F         California / 38,066,920
 11   11.                     72.04°F            Oklahoma / 3,818,851
 12   12.                     71.64°F              Nevada / 2,761,584
 13   13.           

## Average Min Temperature Values

In [5]:
# Take the first parsed table as the working DataFrame.
# .copy() keeps the scraped original in table_min untouched: the next cell
# assigns to min_df.columns in place, which would otherwise also relabel
# table_min[0] and change what this cell shows when it is re-run.

min_df = table_min[0].copy()
min_df.head()

Unnamed: 0,0,1,2
0,Rank,Average Min. Temperature ▼,State / Population
1,1.,65.09°F,"Hawaii / 1,392,704"
2,2.,61.50°F,"Florida / 19,361,792"
3,3.,56.07°F,"Louisiana / 4,601,049"
4,4.,54.16°F,"Texas / 26,092,033"


In [6]:
# Promote the first row (it holds the page's header text) to column labels.
# The row itself stays in the data for now, as the preview shows.

header_row = min_df.iloc[0]
min_df.columns = header_row
min_df.head()

Unnamed: 0,Rank,Average Min. Temperature ▼,State / Population
0,Rank,Average Min. Temperature ▼,State / Population
1,1.,65.09°F,"Hawaii / 1,392,704"
2,2.,61.50°F,"Florida / 19,361,792"
3,3.,56.07°F,"Louisiana / 4,601,049"
4,4.,54.16°F,"Texas / 26,092,033"


In [7]:
# Drop the leftover header row (row 0 repeats the column labels).
# Guarded so the cell is safe to re-run: an unconditional drop would remove the
# first *data* row on every additional execution.

if min_df.iloc[0].tolist() == min_df.columns.tolist():
    min_df = min_df.drop(index=min_df.index[0])

In [8]:
# Keep only the temperature and state/population columns (this discards Rank).

min_df = min_df.loc[:, ['Average Min. Temperature ▼', 'State / Population']]
min_df.head()

Unnamed: 0,Average Min. Temperature ▼,State / Population
1,65.09°F,"Hawaii / 1,392,704"
2,61.50°F,"Florida / 19,361,792"
3,56.07°F,"Louisiana / 4,601,049"
4,54.16°F,"Texas / 26,092,033"
5,52.26°F,"Mississippi / 2,984,345"


In [9]:
# Give the temperature column a shorter label without the sort-arrow glyph.

min_df = min_df.rename(
    {'Average Min. Temperature ▼': 'Avg Min Temp (F)'},
    axis='columns',
)
min_df.head()

Unnamed: 0,Avg Min Temp (F),State / Population
1,65.09°F,"Hawaii / 1,392,704"
2,61.50°F,"Florida / 19,361,792"
3,56.07°F,"Louisiana / 4,601,049"
4,54.16°F,"Texas / 26,092,033"
5,52.26°F,"Mississippi / 2,984,345"


In [10]:
# Split the "State / Population" column.
# Values look like "Hawaii / 1,392,704": the text before the first " / "
# becomes the new State column and the combined column is then removed.
# Written as one chain that returns a new frame, instead of mutating min_df
# with inplace=True and leaving a throwaway `split` variable in the namespace.

min_df = (
    min_df
    .assign(State=lambda df: df['State / Population'].str.split(" / ", n=1, expand=True)[0])
    .drop(columns=['State / Population'])
)

In [11]:
# Strip the "°F" suffix from the temperature strings.
# The text before the first "°F" goes into a new 'Avg Min Temp (°F)' column
# (still strings at this point) and the raw column is removed.
# Written as one chain that returns a new frame, instead of mutating min_df
# with inplace=True and leaving a throwaway `split_2` variable behind.

min_df = (
    min_df
    .assign(**{
        'Avg Min Temp (°F)': lambda df: df['Avg Min Temp (F)'].str.split('°F', n=1, expand=True)[0]
    })
    .drop(columns=['Avg Min Temp (F)'])
)

min_df.head()

Unnamed: 0,State,Avg Min Temp (°F)
1,Hawaii,65.09
2,Florida,61.5
3,Louisiana,56.07
4,Texas,54.16
5,Mississippi,52.26


In [12]:
# Inspect the column dtypes: per the recorded output, the temperature column
# is still `object` (strings) before the numeric conversion.
min_df.dtypes

0
State                object
Avg Min Temp (°F)    object
dtype: object

In [13]:
# Parse the cleaned temperature strings into numbers.

min_df['Avg Min Temp (°F)'] = min_df['Avg Min Temp (°F)'].pipe(pd.to_numeric)

In [14]:
# Re-check the dtypes: per the recorded output, the temperature column is
# float64 after the conversion.
min_df.dtypes

0
State                 object
Avg Min Temp (°F)    float64
dtype: object

In [15]:
# Write the cleaned min-temperature table to CSV, with a header row and
# without the row index.
min_df.to_csv(
    path_or_buf="Resources/avg_min_temp.csv",
    header=True,
    index=False,
)

## Average Max Temperature Values

In [16]:
# Take the first parsed table as the working DataFrame.
# .copy() keeps the scraped original in table_max untouched: the next cell
# assigns to max_df.columns in place, which would otherwise also relabel
# table_max[0] and change what this cell shows when it is re-run.

max_df = table_max[0].copy()
max_df.head()

Unnamed: 0,0,1,2
0,Rank,Average Max. Temperature ▼,State / Population
1,1.,81.97°F,"Florida / 19,361,792"
2,2.,80.66°F,"Hawaii / 1,392,704"
3,3.,80.59°F,"Arizona / 6,561,516"
4,4.,77.73°F,"Texas / 26,092,033"


In [17]:
# Promote the first row (it holds the page's header text) to column labels.
# The row itself stays in the data for now, as the preview shows.

header_row = max_df.iloc[0]
max_df.columns = header_row
max_df.head()

Unnamed: 0,Rank,Average Max. Temperature ▼,State / Population
0,Rank,Average Max. Temperature ▼,State / Population
1,1.,81.97°F,"Florida / 19,361,792"
2,2.,80.66°F,"Hawaii / 1,392,704"
3,3.,80.59°F,"Arizona / 6,561,516"
4,4.,77.73°F,"Texas / 26,092,033"


In [18]:
# Drop the leftover header row (row 0 repeats the column labels).
# Guarded so the cell is safe to re-run: an unconditional drop would remove the
# first *data* row on every additional execution.

if max_df.iloc[0].tolist() == max_df.columns.tolist():
    max_df = max_df.drop(index=max_df.index[0])

In [19]:
# Keep only the temperature and state/population columns (this discards Rank).

max_df = max_df.loc[:, ['Average Max. Temperature ▼', 'State / Population']]
max_df.head()

Unnamed: 0,Average Max. Temperature ▼,State / Population
1,81.97°F,"Florida / 19,361,792"
2,80.66°F,"Hawaii / 1,392,704"
3,80.59°F,"Arizona / 6,561,516"
4,77.73°F,"Texas / 26,092,033"
5,77.33°F,"Louisiana / 4,601,049"


In [20]:
# Give the temperature column a shorter label without the sort-arrow glyph.

max_df = max_df.rename(
    {'Average Max. Temperature ▼': 'Avg Max Temp (F)'},
    axis='columns',
)
max_df.head()

Unnamed: 0,Avg Max Temp (F),State / Population
1,81.97°F,"Florida / 19,361,792"
2,80.66°F,"Hawaii / 1,392,704"
3,80.59°F,"Arizona / 6,561,516"
4,77.73°F,"Texas / 26,092,033"
5,77.33°F,"Louisiana / 4,601,049"


In [21]:
# Split the "State / Population" column.
# Values look like "Florida / 19,361,792": the text before the first " / "
# becomes the new State column and the combined column is then removed.
# Written as one chain that returns a new frame, instead of mutating max_df
# with inplace=True and leaving a throwaway `split_3` variable in the namespace.

max_df = (
    max_df
    .assign(State=lambda df: df['State / Population'].str.split(" / ", n=1, expand=True)[0])
    .drop(columns=['State / Population'])
)

In [22]:
# Strip the "°F" suffix from the temperature strings.
# The text before the first "°F" goes into a new 'Avg Max Temp (°F)' column
# (still strings at this point) and the raw column is removed.
# Written as one chain that returns a new frame, instead of mutating max_df
# with inplace=True and leaving a throwaway `split_4` variable behind.

max_df = (
    max_df
    .assign(**{
        'Avg Max Temp (°F)': lambda df: df['Avg Max Temp (F)'].str.split('°F', n=1, expand=True)[0]
    })
    .drop(columns=['Avg Max Temp (F)'])
)

max_df.head()

Unnamed: 0,State,Avg Max Temp (°F)
1,Florida,81.97
2,Hawaii,80.66
3,Arizona,80.59
4,Texas,77.73
5,Louisiana,77.33


In [23]:
# Parse the cleaned temperature strings into numbers.

max_df['Avg Max Temp (°F)'] = max_df['Avg Max Temp (°F)'].pipe(pd.to_numeric)

In [24]:
# Write the cleaned max-temperature table to CSV, with a header row and
# without the row index.
max_df.to_csv(
    path_or_buf="Resources/avg_max_temp.csv",
    header=True,
    index=False,
)