### Imports 

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import seaborn as sns
import sklearn as skl
plt.style.use('ggplot')

### Read Data

In [None]:
# Location of the weather CSV. The original local path is kept as the fallback
# so existing runs behave the same; set the WEATHER_CSV environment variable to
# point the notebook at the file on another machine.
DATA_PATH = os.environ.get(
    "WEATHER_CSV",
    r"C:\Users\Admin\Dropbox\My PC (DESKTOP-7FDR3DB)\Downloads\weatherHistory.csv",
)
data = pd.read_csv(DATA_PATH)
datac = data.copy()  # snapshot of the frame exactly as loaded
data.head()

In [None]:
# Plain Python list of every column label in the loaded frame.
columns = list(data.columns)

### checking dtypes of columns  

In [None]:
# Show the dtype pandas assigned to each column.
data.dtypes

In [None]:
# Split the column labels by dtype: object-typed columns versus everything else.
is_object = (data.dtypes == 'object').values
obj_columns = data.columns[is_object].to_list()
num_columns = data.columns[~is_object].to_list()
# 'Formatted Date' is handled separately, so it is taken out of the object list.
obj_columns.remove('Formatted Date')

### Unique Values

In [None]:
# Number of distinct values in each column listed in obj_columns, as a {column: count} dict.
data[obj_columns].nunique().to_dict()

### Checking for Missing values

In [None]:
# Count of missing values per column.
data.isnull().sum()

There are 517 missing values in `Precip Type`, which is very small compared to the overall data set, so we will drop those rows.

In [None]:
# Drop every row containing at least one missing value and report the shape
# before and after.
shape_before = data.shape
data.dropna(inplace=True)
datac = data.copy()
print('before shape :', shape_before)
print('after shape :', data.shape)


### Manipulation of date data

In [None]:
# Parse every timestamp once and derive all calendar fields from that single
# result (the column used to be re-parsed for each derived field).
# Only the text before the first '.000' is parsed; whatever follows it in the
# raw string is ignored.
timestamps = pd.to_datetime(
    data['Formatted Date'].map(lambda raw: raw.split('.000')[0])
)
data['date'] = timestamps.dt.date
data['hour'] = timestamps.dt.hour
data['month'] = timestamps.dt.month
data['year'] = timestamps.dt.year
# The raw string column is no longer needed once the fields are extracted.
data.drop(['Formatted Date'], axis=1, inplace=True)

In [None]:
# Put the derived calendar fields first, then order the rows by year, month
# and date.
ordered_columns = ['year', 'month', 'hour', 'date', *num_columns, *obj_columns]
data = data[ordered_columns].sort_values(by=['year', 'month', 'date'])
datac = data.copy()
data.head()

In [None]:
# Print how many records exist for every (year, month) combination.
years = list(data['year'].unique())
for y in years:
  print(y, ' :')
  in_year = data['year'] == y
  for m in range(1, 13):
    n = int((in_year & (data['month'] == m)).sum())
    print(f'   {m} : {n} records.')

### Scale data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the numeric columns and overwrite them with the scaled
# values, so every later cell works on scaled data.
# (The placeholder `scaler = []` assignment was dead code and is removed.)
scaler = MinMaxScaler()
data[num_columns] = scaler.fit_transform(data[num_columns].values)



### Distribution of Numeric columns over year

In [None]:
import plotly.express as px
pd.options.plotting.backend = "plotly"
from plotly.subplots import make_subplots

In [None]:
# One histogram figure per year, faceted by variable and titled with the year.
for y in years:
    yearly_values = data.loc[data['year'] == y, num_columns]
    f = yearly_values.plot(kind='hist', bins=13, facet_col="variable", title=str(y))
    f.show()

In [None]:
# One box-plot figure per numeric column, with the values grouped by year on
# the x axis.
for c in num_columns:
    column_by_year = data[[c, 'year']]
    f = column_by_year.plot(kind='box', x='year', y=c)
    f.show()

### Statistical change over the years

In [None]:
# Collect describe() output (without the row count) for each year, tagging
# every statistics row with the year it belongs to.
yearly_stats = []
for y in years:
  stats = data.loc[data['year'] == y, num_columns].describe().drop(['count'], axis=0)
  stats['year'] = y
  yearly_stats.append(stats)

# After reset_index the statistic name lives in the 'index' column; grouping by
# (statistic, year) yields a frame that can be sliced per statistic.
d = pd.concat(yearly_stats, axis=0).reset_index()
d = d.groupby(['index', 'year']).sum()

# One bar chart per statistic, showing every numeric column across the years.
for p in ['std', 'min', 'mean', 'max', '25%', '50%', '75%']:
  f = d.loc[p].plot(kind='bar', title=p)
  f.show()




### Converting hourly data into daily data

In [None]:
# Collapse the records of each date into one row: numeric columns are averaged,
# categorical columns take the first entry of value_counts() (the most frequent
# value).
d = data.copy()
by_date = d.groupby(['date'])
num_d = by_date[num_columns].mean()
obj_d = by_date[obj_columns].agg(lambda values: values.value_counts().index[0])

In [None]:
# Daily numeric means and daily categorical values side by side; both frames
# come from the same groupby on 'date', so they share one index.
daily_d = num_d.join(obj_d)

### Analysis of Summary of weather

In [None]:
# List the distinct values found in the 'Daily Summary' column.
data['Daily Summary'].unique()

1. Is there any massive change in the continuous data from day to day over the years?

In [None]:
# Standard deviation of every numeric column within each date, re-indexed by
# real timestamps (labels that cannot be parsed become NaT).
d = data.copy()
std_num_day = d.groupby(['date'])[num_columns].std()
std_num_day = std_num_day.set_axis(
    pd.to_datetime(std_num_day.index, errors='coerce'), axis=0
)

In [None]:
#std_num_day1  = std_num_day.groupby(pd.Grouper(freq='1M'))

In [None]:
# Tag each day with its year so the line plots can be coloured per year, then
# draw one figure per numeric column.
std_num_day['year'] = std_num_day.index.year
for c in num_columns:
    column_with_year = std_num_day[[c, 'year']]
    f = column_with_year.plot(kind='line', color='year', title=c)
    f.show()
  
    

2. Is there any massive change in the continuous data from month to month over the years?

In [None]:
# describe() statistics of every numeric column for each (month, year) pair.
d = data.groupby(['month', 'year'])[num_columns].describe()
d.head()

In [None]:
# For each numeric column, plot its describe() statistics against a
# "month - year" label built from the group keys.
for n in num_columns:
  dd = d[n].reset_index().astype({'month': 'str', 'year': 'str'})
  dd['month_year'] = dd['month'] + ' - ' + dd['year']

  stats_to_plot = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
  px.line(dd, x='month_year', y=stats_to_plot, title=n).show()

3. Categorical features analysis

In [None]:
obj_d.index = pd.to_datetime(obj_d.index)
Precip_Type_df = pd.DataFrame(obj_d['Precip Type'])
Summary_df = pd.DataFrame(obj_d['Summary'])
Daily_Summary_df = pd.DataFrame(obj_d['Daily Summary'])


def _monthly_category_counts(category_df):
    """Count how many rows fall into each category per calendar month.

    Parameters
    ----------
    category_df : pandas.DataFrame
        Single categorical column with a datetime index.

    Returns
    -------
    pandas.DataFrame
        One count column per category (named after the category value) plus a
        ``month`` column holding the ``'%B-%Y'`` label of each row.
    """
    counts = (
        pd.get_dummies(category_df, prefix=[''], prefix_sep='')
        .groupby(pd.Grouper(freq='1M'))
        .sum()
    )
    # Build the label from this frame's own index; previously two of the three
    # frames borrowed the index of the 'Precip Type' frame.
    counts['month'] = counts.index.strftime('%B-%Y')
    return counts


# One line per category value, month labels on the x axis. The names below end
# up holding the figures, which the next cell displays.
obj_d_Precip_Type_month = _monthly_category_counts(Precip_Type_df).plot.line(
    x='month', y=list(Precip_Type_df['Precip Type'].unique()))
obj_d_Summary_month = _monthly_category_counts(Summary_df).plot.line(
    x='month', y=list(Summary_df['Summary'].unique()))
obj_d_Daily_Summary_month = _monthly_category_counts(Daily_Summary_df).plot.line(
    x='month', y=list(Daily_Summary_df['Daily Summary'].unique()))

In [None]:
# Render the three monthly figures one after another. Calling .show() as a
# statement (instead of building a tuple of the calls) keeps the tuple of
# return values out of the cell output.
for fig in (obj_d_Precip_Type_month, obj_d_Summary_month, obj_d_Daily_Summary_month):
    fig.show()

### Year-over-year categorical analysis

In [None]:
obj_d.index = pd.to_datetime(obj_d.index)
Precip_Type_df = pd.DataFrame(obj_d['Precip Type'])
Summary_df = pd.DataFrame(obj_d['Summary'])
Daily_Summary_df = pd.DataFrame(obj_d['Daily Summary'])


def _yearly_category_counts(category_df):
    """Count how many rows fall into each category per calendar year.

    Parameters
    ----------
    category_df : pandas.DataFrame
        Single categorical column with a datetime index.

    Returns
    -------
    pandas.DataFrame
        One count column per category (named after the category value) plus a
        ``year`` column holding the ``'%Y'`` label of each row.
    """
    counts = (
        pd.get_dummies(category_df, prefix=[''], prefix_sep='')
        .groupby(pd.Grouper(freq='1Y'))
        .sum()
    )
    # Label from this frame's own index, in a column named for what it holds
    # (it used to be called 'month' and, for two frames, was derived from the
    # 'Precip Type' frame's index).
    counts['year'] = counts.index.strftime('%Y')
    return counts


# One line per category value, year labels on the x axis. The names below end
# up holding the figures, which the next cell displays.
obj_d_Precip_Type_year = _yearly_category_counts(Precip_Type_df).plot.line(
    x='year', y=list(Precip_Type_df['Precip Type'].unique()))
obj_d_Summary_year = _yearly_category_counts(Summary_df).plot.line(
    x='year', y=list(Summary_df['Summary'].unique()))
obj_d_Daily_Summary_year = _yearly_category_counts(Daily_Summary_df).plot.line(
    x='year', y=list(Daily_Summary_df['Daily Summary'].unique()))

In [None]:
# Render the three yearly figures one after another. Calling .show() as a
# statement (instead of building a tuple of the calls) keeps the tuple of
# return values out of the cell output.
for fig in (obj_d_Precip_Type_year, obj_d_Summary_year, obj_d_Daily_Summary_year):
    fig.show()

In [None]:
# NOTE(review): the next code cell starts with the same assignment, so this
# copy is replaced before anything reads it.
d = data.copy()

### Categorical to continuous analysis

#### Precip Type to Numeric Analysis

In [None]:
# Aggregate the records into 15-day bins: mean of the numeric columns and the
# first value_counts() entry (most frequent value) of the categorical ones.
d = data.copy()
d.index = pd.to_datetime(d['date'])
bins_15d = d.groupby(pd.Grouper(freq='15D'))
d_num = bins_15d[num_columns].mean()
d_obj = bins_15d[obj_columns].agg(lambda values: values.value_counts().index[0])
d = pd.concat([d_num, d_obj], axis=1).reset_index()
# Label used as the x axis of the plots in the following cells.
d['month_year'] = d['date'].dt.strftime('%B-%Y')
d

In [None]:
precip_type_num = d.groupby('Precip Type')[num_columns+['month_year']]

# The loop variable must not be called `d`: that would rebind `d` to the last
# group, and the cells below group `d` again by other columns.
for n, group in precip_type_num:
  px.line(group, x='month_year', y=num_columns, title=n).show()

#### Summary to Numeric Analysis

In [None]:

precip_type_num = d.groupby('Summary')[num_columns+['month_year']]

# The loop variable must not be called `d`: that would rebind `d` to the last
# group, and the next cell groups `d` again by 'Daily Summary'.
for n, group in precip_type_num:
  px.line(group, x='month_year', y=num_columns, title=n).show()

#### Daily Summary to Numeric Analysis

In [None]:
precip_type_num = d.groupby('Daily Summary')[num_columns+['month_year']]

# A dedicated loop variable keeps `d` intact, so this cell can be re-run
# without first rebuilding the aggregated frame.
for n, group in precip_type_num:
  px.line(group, x='month_year', y=num_columns, title=n).show()

### Decompose Numeric columns

In [None]:
d = data.copy()
d.index = pd.to_datetime(d['date'])
# Average only the numeric columns: the frame also carries text columns and the
# `date` objects, which cannot be averaged (recent pandas raises instead of
# silently dropping them).
dd_mean = d.select_dtypes(include='number').resample('M').mean()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Large default figure size for the decomposition plots.
plt.rcParams['figure.figsize'] = [20, 10]
# Decompose each monthly series with a 12-step period and draw the result.
for col in num_columns:
    seasonal_decompose(dd_mean[col], model='Additive', period=12).plot()

### Numeric Features Test

In [None]:
d = data.copy()
d.index = pd.to_datetime(d['date'])
print('before :', d.shape)
# Aggregate only the numeric columns: the frame also carries text columns and
# the `date` objects, which have no mean or standard deviation (recent pandas
# raises instead of silently dropping them).
monthly = d.select_dtypes(include='number').resample('M')
dd_mean = monthly.mean()
dd_std = monthly.std()
print('after :', dd_std.shape)

In [None]:
# Keep only the columns listed in num_columns.
dd_mean = dd_mean[num_columns]

In [None]:
# Display the monthly means.
dd_mean

In [None]:
# define Dickey-Fuller Test (DFT) function
# Null is that unit root is present, rejection means likely stationary
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.stattools import adfuller


def dftest(timeseries, window=12, alpha=0.05):
    """Run an Augmented Dickey-Fuller test on a series and plot its rolling statistics.

    Prints whether the series is judged stationary (p-value <= ``alpha``) and
    shows the series together with its rolling mean and rolling standard
    deviation.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to test. Its ``name`` must be a string: it is used in the
        printed verdict, the plot title and the plotted column labels.
    window : int, default 12
        Number of observations in the rolling mean / standard deviation window.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    pandas.Series
        Test statistic, p-value, lags used, observations used and one
        ``Critical Value (...)`` entry per level reported by ``adfuller``.
    """
    adf_result = ts.adfuller(timeseries)
    dfoutput = pd.Series(adf_result[0:4],
                         index=['Test Statistic', 'p-value', 'Lags Used', 'Observations Used'])
    for key, value in adf_result[4].items():
        dfoutput['Critical Value (%s)' % key] = value

    # The null hypothesis is a unit root; rejecting it means likely stationary.
    if dfoutput['p-value'] <= alpha:
        print(f'{timeseries.name} The series is stationary')
    else:
        print(f'{timeseries.name} The series is NOT stationary')

    # Rolling statistics, shown next to the raw series.
    rolmean = timeseries.rolling(window=window).mean()
    rolstd = timeseries.rolling(window=window).std()

    d = pd.concat([timeseries, rolmean, rolstd], axis=1)
    s = timeseries.name
    d.columns = [s, s+' rolling_mean', s+' rolling_std']
    # Needs a plotting backend whose figures offer update_layout()/show(),
    # i.e. the plotly backend selected earlier in the notebook.
    d.plot().update_layout(title=timeseries.name).show()

    # Hand the numbers back so callers can inspect or tabulate them.
    return dfoutput


In [None]:
# Remove every month that has a missing value in any column.
dd_mean = dd_mean.dropna(axis=0)
dd_std = dd_std.dropna(axis=0)

In [None]:
# Run the stationarity check on each monthly-mean series, with a banner line
# before and a blank line after every column's output.
for col in num_columns:
    banner = f'______________________________ {col} ________________________________'
    print(banner)
    dftest(dd_mean[col])
    print()

In [None]:
import statsmodels.api as sm

In [None]:
# Used to determine the seasonal order from the PACF and ACF plots.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# in later cells; consider narrowing it to the specific warning it targets.
warnings.filterwarnings("ignore")
for col in num_columns:
    f, ax = plt.subplots(1, 2, figsize=(30, 10))
    # P: partial autocorrelation, left panel.
    plot_pacf(dd_mean[col], lags=12, ax=ax[0])
    # Q: autocorrelation, right panel.
    plot_acf(dd_mean[col], lags=12, ax=ax[1])
    f.suptitle(col, fontsize=25)
    # plt.show() renders the figure on any backend; Figure.show() is meant for
    # GUI backends.
    plt.show()
  

### Forecast for next year

In [None]:
#Introducing another model
# Fit a SARIMAX model on the first 90 rows of the monthly 'Temperature (C)'
# means; the rows after that are left out of the fit.
# order is (p, d, q) and seasonal_order is (P, D, Q, s); trend='c' adds a
# constant term.
# NOTE(review): seasonal orders of 12 and 8 with s=12 are large for 90
# observations; confirm these values are intended.
sar2 = sm.tsa.statespace.SARIMAX(dd_mean['Temperature (C)'].iloc[:90,], 
                                order=(3,1,1),
                                seasonal_order=(12,1,8,12), 
                                trend='c').fit()

In [None]:
# Display the fitted model's summary.
sar2.summary()

In [None]:
# The model saw positions 0 .. nobs-1, so the first out-of-sample step is
# position `nobs`; starting one step later would leave a one-month gap.
forecast_start = int(sar2.nobs)
forecast = sar2.predict(start=forecast_start, end=150, dynamic=False)

# Kept for compatibility: column assignment aligns on dd_mean's index, so this
# column only holds the forecast for months that already exist in dd_mean.
dd_mean['forecast'] = forecast

# Outer-join the observed series with the full forecast so the steps beyond the
# last observed month are plotted as well.
# NOTE(review): assumes predict() returns a date-indexed Series — confirm.
forecast_df = pd.concat(
    [dd_mean['Temperature (C)'], forecast.rename('forecast')], axis=1
)
px.line(forecast_df, y=['Temperature (C)', 'forecast'], title='Temperature Forecast').show()