In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
# Location of the raw CSV, relative to the notebook's working directory
file_path = "./Resources/GlobalTemperatures.csv"
# Read the file into a DataFrame and preview its first five rows
global_temp_df = pd.read_csv(filepath_or_buffer=file_path)
global_temp_df.head(n=5)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


In [3]:
# Helper for casting the 'dt' column to datetime. It is only defined in this
# cell, not applied; pass it to DataFrame.assign (e.g. df.assign(dt=dt)) to
# actually convert the column.
def dt(df_):
    """Return the 'dt' column of ``df_`` parsed with ``pd.to_datetime``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame that contains a 'dt' column.

    Returns
    -------
    pandas.Series
        The 'dt' column converted to datetimes; ``df_`` itself is not modified.
    """
    return pd.to_datetime(df_['dt'])

In [4]:
# Keep only the global temperature records dated 1900-01-01 or later.
# 'dt' holds ISO 'YYYY-MM-DD' strings, so comparing against the string cutoff
# orders the rows the same way the dates do.
world_temp_df = global_temp_df[global_temp_df['dt'] >= '1900-01-01']
world_temp_df


Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
1800,1900-01-01,1.461,0.276,7.193,0.465,-4.102,0.395,13.142,0.142
1801,1900-02-01,3.098,0.416,9.181,0.604,-2.814,0.626,13.777,0.173
1802,1900-03-01,5.492,0.261,11.377,0.327,-0.680,0.610,14.400,0.141
1803,1900-04-01,8.223,0.292,13.972,0.342,2.131,0.394,15.170,0.151
1804,1900-05-01,11.385,0.357,17.415,0.329,5.179,0.379,15.955,0.159
...,...,...,...,...,...,...,...,...,...
3187,2015-08-01,14.755,0.072,20.699,0.110,9.005,0.170,17.589,0.057
3188,2015-09-01,12.999,0.079,18.845,0.088,7.199,0.229,17.049,0.058
3189,2015-10-01,10.801,0.102,16.450,0.059,5.232,0.115,16.290,0.062
3190,2015-11-01,7.433,0.119,12.892,0.093,2.157,0.106,15.252,0.063


In [5]:
# Discard every row that has a missing value in any column
cleanWorld_temp_df = world_temp_df.dropna(axis="index", how="any")
cleanWorld_temp_df.head()

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
1800,1900-01-01,1.461,0.276,7.193,0.465,-4.102,0.395,13.142,0.142
1801,1900-02-01,3.098,0.416,9.181,0.604,-2.814,0.626,13.777,0.173
1802,1900-03-01,5.492,0.261,11.377,0.327,-0.68,0.61,14.4,0.141
1803,1900-04-01,8.223,0.292,13.972,0.342,2.131,0.394,15.17,0.151
1804,1900-05-01,11.385,0.357,17.415,0.329,5.179,0.379,15.955,0.159


In [6]:
# Count the distinct values in each column
cleanWorld_temp_df.nunique()


dt                                           1392
LandAverageTemperature                       1314
LandAverageTemperatureUncertainty             303
LandMaxTemperature                           1305
LandMaxTemperatureUncertainty                 380
LandMinTemperature                           1322
LandMinTemperatureUncertainty                 367
LandAndOceanAverageTemperature               1180
LandAndOceanAverageTemperatureUncertainty     117
dtype: int64

In [7]:
# Inspect the dtype of each column
cleanWorld_temp_df.dtypes

dt                                            object
LandAverageTemperature                       float64
LandAverageTemperatureUncertainty            float64
LandMaxTemperature                           float64
LandMaxTemperatureUncertainty                float64
LandMinTemperature                           float64
LandMinTemperatureUncertainty                float64
LandAndOceanAverageTemperature               float64
LandAndOceanAverageTemperatureUncertainty    float64
dtype: object

In [8]:
# Number of missing values left in the land average temperature column
cleanWorld_temp_df['LandAverageTemperature'].isna().sum()


0

In [9]:
# How often each distinct land average temperature uncertainty value occurs
cleanWorld_temp_df['LandAverageTemperatureUncertainty'].value_counts()

0.087    20
0.064    19
0.078    16
0.077    16
0.068    14
         ..
0.169     1
0.156     1
0.366     1
0.160     1
0.130     1
Name: LandAverageTemperatureUncertainty, Length: 303, dtype: int64

In [10]:
# Number of missing values left in the land-and-ocean average temperature column
cleanWorld_temp_df['LandAndOceanAverageTemperature'].isna().sum()

0

In [11]:
# Relative frequency (share of rows) of each distinct land-and-ocean average
# temperature. The first positional parameter of value_counts is `normalize`,
# so it is passed by keyword here; append .head(20) to list only the 20 most
# frequent values.
cleanWorld_temp_df['LandAndOceanAverageTemperature'].value_counts(normalize=True)

16.846    0.002874
13.773    0.002155
13.522    0.002155
13.711    0.002155
17.049    0.002155
            ...   
13.932    0.000718
16.080    0.000718
16.946    0.000718
15.384    0.000718
14.774    0.000718
Name: LandAndOceanAverageTemperature, Length: 1180, dtype: float64

In [12]:
# Rename the columns to shorter labels and index the rows by date.
# The frame is rebuilt from world_temp_df (with nulls dropped again) rather
# than from cleanWorld_temp_df itself, so re-running this cell does not fail in
# set_index("Date") after "Date" has already become the index.
cleanWorld_temp_df = (
    world_temp_df
    .dropna()
    .rename(columns={"dt": "Date",
                     "LandAverageTemperature": "LandAvg_temp",
                     "LandAverageTemperatureUncertainty": "LandAvg_temp_Uncer",
                     "LandMaxTemperature": "LandMax_temp",
                     "LandMaxTemperatureUncertainty": "LandMax_tempUncer",
                     "LandMinTemperature": "LandMin_temp",
                     "LandMinTemperatureUncertainty": "LandMin_tempUncer",
                     "LandAndOceanAverageTemperature": "LandOceanAvg_temp",
                     "LandAndOceanAverageTemperatureUncertainty": "LandOceanAvg_tempUncer"})
    .set_index("Date")
)
cleanWorld_temp_df

Unnamed: 0_level_0,LandAvg_temp,LandAvg_temp_Uncer,LandMax_temp,LandMax_tempUncer,LandMin_temp,LandMin_tempUncer,LandOceanAvg_temp,LandOceanAvg_tempUncer
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1900-01-01,1.461,0.276,7.193,0.465,-4.102,0.395,13.142,0.142
1900-02-01,3.098,0.416,9.181,0.604,-2.814,0.626,13.777,0.173
1900-03-01,5.492,0.261,11.377,0.327,-0.680,0.610,14.400,0.141
1900-04-01,8.223,0.292,13.972,0.342,2.131,0.394,15.170,0.151
1900-05-01,11.385,0.357,17.415,0.329,5.179,0.379,15.955,0.159
...,...,...,...,...,...,...,...,...
2015-08-01,14.755,0.072,20.699,0.110,9.005,0.170,17.589,0.057
2015-09-01,12.999,0.079,18.845,0.088,7.199,0.229,17.049,0.058
2015-10-01,10.801,0.102,16.450,0.059,5.232,0.115,16.290,0.062
2015-11-01,7.433,0.119,12.892,0.093,2.157,0.106,15.252,0.063


In [13]:
# Round every measurement column to two decimals while keeping it numeric.
# Each value is passed through the "{:.2f}" format (so the rounded digits are
# exactly the ones that format produces) and then converted back to float.
# Leaving the formatted strings in the frame would turn the columns into
# object dtype and make this cell raise when it is run a second time.
two_decimal_columns = [
    "LandAvg_temp",
    "LandAvg_temp_Uncer",
    "LandMax_temp",
    "LandMax_tempUncer",
    "LandMin_temp",
    "LandMin_tempUncer",
    "LandOceanAvg_temp",
    "LandOceanAvg_tempUncer",
]
cleanWorld_temp_df = cleanWorld_temp_df.assign(
    **{
        column: cleanWorld_temp_df[column].map("{:.2f}".format).astype(float)
        for column in two_decimal_columns
    }
)
cleanWorld_temp_df

Unnamed: 0_level_0,LandAvg_temp,LandAvg_temp_Uncer,LandMax_temp,LandMax_tempUncer,LandMin_temp,LandMin_tempUncer,LandOceanAvg_temp,LandOceanAvg_tempUncer
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1900-01-01,1.46,0.28,7.19,0.47,-4.10,0.40,13.14,0.14
1900-02-01,3.10,0.42,9.18,0.60,-2.81,0.63,13.78,0.17
1900-03-01,5.49,0.26,11.38,0.33,-0.68,0.61,14.40,0.14
1900-04-01,8.22,0.29,13.97,0.34,2.13,0.39,15.17,0.15
1900-05-01,11.38,0.36,17.42,0.33,5.18,0.38,15.96,0.16
...,...,...,...,...,...,...,...,...
2015-08-01,14.76,0.07,20.70,0.11,9.00,0.17,17.59,0.06
2015-09-01,13.00,0.08,18.84,0.09,7.20,0.23,17.05,0.06
2015-10-01,10.80,0.10,16.45,0.06,5.23,0.12,16.29,0.06
2015-11-01,7.43,0.12,12.89,0.09,2.16,0.11,15.25,0.06


In [14]:
# Save the cleaned data in the Resources folder
filepath = Path('Resources/cleanWorld_temp.csv')
# Create the target folder when it does not exist yet
filepath.parent.mkdir(parents=True, exist_ok=True)
# Write the frame to disk; to_csv includes the index by default, so the
# "Date" index ends up as the first column of the file
cleanWorld_temp_df.to_csv(filepath)