# 2.3 Reducing Dimensionality for Climate Data

## Contents
### 1. Import libraries and Data
### 2. Dimensional Reduction for Weather Data
### 3. Incorporating "Pleasant Weather" Dataframe


## 1. Import Libraries and Data

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure

In [8]:
#Create a path to where your data is stored.
path = r'/Users/sydneyjohnson/Documents/CF Data Analytics Course/11-2024 ClimateWins Analysis/02 Data'

In [9]:
df = pd.read_csv(os.path.join(path, 'scaled2010s.csv'))

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,BASEL_temp_mean,BELGRADE_temp_mean,BUDAPEST_temp_mean,DEBILT_temp_mean,DUSSELDORF_temp_mean,HEATHROW_temp_mean,KASSEL_temp_mean,LJUBLJANA_temp_mean,MAASTRICHT_temp_mean,MADRID_temp_mean,MUNCHENB_temp_mean,OSLO_temp_mean,SONNBLICK_temp_mean,STOCKHOLM_temp_mean,VALENTIA_temp_mean
0,0,-1.128475,-0.453788,-0.906738,-1.965574,-1.805708,-1.795894,-2.467463,-0.611718,-1.917968,-1.308362,-0.941898,-1.817358,-0.8259,-1.964022,-9.294301
1,1,-1.720538,-1.160884,-1.295436,-1.885796,-1.864336,-1.848122,-2.698046,-1.139683,-1.932852,-1.384855,-1.65859,-2.222343,-1.892107,-2.12674,-10.236726
2,2,-1.94929,-1.613867,-1.695566,-2.332553,-2.098851,-1.865531,-2.614198,-1.499658,-2.200764,-1.282864,-1.799369,-2.394154,-2.563964,-2.189323,-10.550868
3,3,-2.178042,-1.547577,-1.947076,-2.172997,-2.245422,-2.509679,-3.284984,-1.835635,-2.424025,-1.027886,-1.965744,-2.038259,-2.125797,-1.776272,-7.200024
4,4,-2.056938,-1.414996,-1.878482,-1.821973,-2.040222,-2.405222,-3.829998,-1.61965,-1.96262,-1.002388,-2.196109,-2.394154,-1.556179,-2.852709,-8.666018


In [13]:
df = df.drop(columns = ['Unnamed: 0'])

In [15]:
df.head()

Unnamed: 0,BASEL_temp_mean,BELGRADE_temp_mean,BUDAPEST_temp_mean,DEBILT_temp_mean,DUSSELDORF_temp_mean,HEATHROW_temp_mean,KASSEL_temp_mean,LJUBLJANA_temp_mean,MAASTRICHT_temp_mean,MADRID_temp_mean,MUNCHENB_temp_mean,OSLO_temp_mean,SONNBLICK_temp_mean,STOCKHOLM_temp_mean,VALENTIA_temp_mean
0,-1.128475,-0.453788,-0.906738,-1.965574,-1.805708,-1.795894,-2.467463,-0.611718,-1.917968,-1.308362,-0.941898,-1.817358,-0.8259,-1.964022,-9.294301
1,-1.720538,-1.160884,-1.295436,-1.885796,-1.864336,-1.848122,-2.698046,-1.139683,-1.932852,-1.384855,-1.65859,-2.222343,-1.892107,-2.12674,-10.236726
2,-1.94929,-1.613867,-1.695566,-2.332553,-2.098851,-1.865531,-2.614198,-1.499658,-2.200764,-1.282864,-1.799369,-2.394154,-2.563964,-2.189323,-10.550868
3,-2.178042,-1.547577,-1.947076,-2.172997,-2.245422,-2.509679,-3.284984,-1.835635,-2.424025,-1.027886,-1.965744,-2.038259,-2.125797,-1.776272,-7.200024
4,-2.056938,-1.414996,-1.878482,-1.821973,-2.040222,-2.405222,-3.829998,-1.61965,-1.96262,-1.002388,-2.196109,-2.394154,-1.556179,-2.852709,-8.666018


## 2. Dimensional Reduction for Weather Data

In [25]:
#Run the PCA model, reducing to 5 components
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(df)

In [27]:
principalComponents

array([[ 5.87007731e+00,  9.01537531e+00, -1.65470344e+00,
         9.04292223e-01, -1.85806686e-01],
       [ 7.24853181e+00,  9.67372896e+00, -2.16779564e+00,
         5.36729192e-03, -4.38493467e-01],
       [ 8.16618443e+00,  9.76642596e+00, -2.50001438e+00,
        -3.11630178e-01, -8.00412309e-01],
       ...,
       [ 4.37314927e+00, -8.66348110e-01, -6.05506850e-01,
        -5.16826631e-01, -1.16435535e+00],
       [ 3.68968344e+00, -8.56339516e-01, -6.52600904e-01,
        -8.65416570e-01, -9.76940615e-01],
       [ 3.71238013e+00, -7.28869776e-01, -5.31411394e-01,
        -2.26759883e-01, -3.96339585e-01]])

In [29]:
#Make a dataframe of the new components
dfPCA = pd.DataFrame(principalComponents, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5'])

In [31]:
dfPCA

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5
0,5.870077,9.015375,-1.654703,0.904292,-0.185807
1,7.248532,9.673729,-2.167796,0.005367,-0.438493
2,8.166184,9.766426,-2.500014,-0.311630,-0.800412
3,8.159491,6.864394,-0.600067,-0.239239,-0.991286
4,8.201138,8.505246,-0.715600,-0.351653,-0.004670
...,...,...,...,...,...
3647,3.515132,-0.567631,-0.514356,0.033043,0.437615
3648,4.416007,-0.723981,-0.535287,0.038639,-0.363071
3649,4.373149,-0.866348,-0.605507,-0.516827,-1.164355
3650,3.689683,-0.856340,-0.652601,-0.865417,-0.976941


In [47]:
# Export to csv
dfPCA.to_csv(os.path.join(path, 'PCA5_2010s.csv'))

## 3. Incorporating "pleasant weather" dataframe

In [10]:
dfpls = pd.read_csv(os.path.join(path, 'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv'))

In [39]:
dfpls

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,19600102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,19600103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19600104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,19600105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,20221027,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22946,20221028,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22947,20221029,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22948,20221030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Reduce your dataset to the 2010s
pls2010s = dfpls[dfpls['DATE'].astype(str).str.contains('^201')] #<-----INSERT YEAR HERE
pls2010s

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
18263,20100101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18264,20100102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18265,20100103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18266,20100104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18267,20100105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21910,20191227,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21911,20191228,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21912,20191229,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21913,20191230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
#Run the PCA model, reducing to 1 components
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(pls2010s)

In [45]:
#Make a dataframe of the new components
plsPCA = pd.DataFrame(principalComponents, columns=['Pleasant'])
plsPCA

Unnamed: 0,Pleasant
0,-45564.34447
1,-45563.34447
2,-45562.34447
3,-45561.34447
4,-45560.34447
...,...
3647,45561.65553
3648,45562.65553
3649,45563.65553
3650,45564.65553


In [18]:
pls2010s.stack().value_counts()

0           42334
1           12446
20160822        1
20160824        1
20160825        1
            ...  
20130504        1
20130505        1
20130506        1
20130507        1
20191231        1
Name: count, Length: 3654, dtype: int64

In [22]:
(12446)/(12446+42334)

0.2271997079225995