In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Calendar order of the three-letter month labels; used as the category
# order when the `month` column is converted to an ordered categorical.
month_order = 'JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC'.split()

In [None]:
# Read the so2/fso4 table and turn `month` into an ordered categorical whose
# category order is month_order (labels outside month_order become NaN).
fso4_df = pd.read_csv('/home/krishna/UEInfo/SO2/data/so2_fso4.csv')
month_dtype = pd.CategoricalDtype(categories=month_order, ordered=True)
fso4_df['month'] = fso4_df['month'].astype(month_dtype)

In [None]:
# Summary statistics for rh, so2 and fso4 over all rows.
fso4_df.loc[:, ['rh', 'so2', 'fso4']].describe()

In [None]:
# Pairwise correlation matrix of rh, so2 and fso4 over all rows.
fso4_df.loc[:, ['rh', 'so2', 'fso4']].corr()

# Insights (Entire data)
1. Across the entire dataset, fso4 shows essentially no correlation with rh.
2. fso4 is well correlated with so2.

In [None]:
# Correlation matrix restricted to the rows whose month is JAN.
is_january = fso4_df['month'] == 'JAN'
fso4_df.loc[is_january, ['rh', 'so2', 'fso4']].corr()

In [None]:
# Correlation matrix restricted to the rows whose month is MAY.
is_may = fso4_df['month'] == 'MAY'
fso4_df.loc[is_may, ['rh', 'so2', 'fso4']].corr()

In [None]:
# Correlation matrix for the rows where ix == 91 and iy == 69.
# `log_so2` is only added to fso4_df by a later cell, so it is derived here
# (same log(so2 + 0.0001) transform) to keep this cell runnable on a fresh
# kernel executed top to bottom.
# Bracket access is used for 'ix' because attribute access collides with the
# DataFrame.ix indexer on older pandas versions.
single_location_df = fso4_df[(fso4_df['ix'] == 91) & (fso4_df['iy'] == 69)]
single_location_df.assign(
    log_so2=np.log(single_location_df['so2'] + 0.0001)
)[['rh', 'log_so2', 'fso4']].corr()

# Insights (Monthly data)
1. fso4 shows some correlation with rh in a few months
2. fso4 has good correlation with so2

In [None]:
# Mean rh, so2 and fso4 for each month.
# `month` is categorical; relying on the implicit default of `observed` is
# deprecated in recent pandas. Passing observed=False explicitly keeps the
# current behaviour: every category in month_order gets a row (NaN if a month
# has no data), in category order.
fso4_monthly_means = fso4_df.groupby('month', observed=False)[['rh', 'so2', 'fso4']].mean()
fso4_monthly_means

In [None]:
# Preview of the iy column rendered as strings (displayed only, not stored).
fso4_df['iy'].astype('str')

In [None]:
# Line plot of the monthly means of rh, fso4 and so2 on a shared axis.
# fso4 is multiplied by 100 before plotting, as stated in its legend label.
months = fso4_monthly_means.index
monthly_series = [
    (fso4_monthly_means['rh'], 'RH'),
    (fso4_monthly_means['fso4'] * 100, 'fso4*100'),
    (fso4_monthly_means['so2'], 'so2'),
]

plt.figure()
for series_values, series_label in monthly_series:
    plt.plot(months, series_values, marker='o', label=series_label)

plt.title('Monthly Averages of fso4, so2, and rh')
plt.xlabel('Month')
plt.ylabel('Values')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Histogram of fso4 over all rows, 10 bins.
fso4_values = fso4_df['fso4']
plt.hist(fso4_values, bins=10, edgecolor='black')
plt.title('Histogram of fso4')
plt.xlabel('Variable')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Histogram of so2 over all rows, 10 bins.
so2_values = fso4_df['so2']
plt.hist(so2_values, bins=10, edgecolor='black')
plt.title('Histogram of so2')
plt.xlabel('Variable')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Histogram of rh over all rows, 10 bins.
rh_values = fso4_df['rh']
plt.hist(rh_values, bins=10, edgecolor='black')
plt.title('Histogram of rh')
plt.xlabel('Variable')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Outliers

In [None]:
# Box plot of log-transformed so2 for outlier inspection.
# The same 0.0001 offset used for the `log_so2` column elsewhere in this
# notebook is added before taking the log, so that any zero so2 value does not
# turn into -inf (which would also trigger a divide-by-zero RuntimeWarning).
plt.boxplot(np.log(fso4_df['so2'] + 0.0001))

# The plotted quantity is the log of so2, so label the axis accordingly.
plt.ylabel('log(so2 + 0.0001)')
plt.title('Box Plot for Outlier Detection')
plt.show()


In [None]:
# Add a log-transformed so2 column; 0.0001 is added first so the log is
# defined when so2 is zero.
so2_with_offset = fso4_df['so2'] + 0.0001
fso4_df['log_so2'] = np.log(so2_with_offset)

In [None]:
# Correlation matrix using log_so2 in place of so2, over all rows.
fso4_df.loc[:, ['rh', 'log_so2', 'fso4']].corr()

In [None]:
# Same correlation matrix (with log_so2), restricted to JAN rows.
january_rows = fso4_df['month'] == 'JAN'
fso4_df.loc[january_rows, ['rh', 'log_so2', 'fso4']].corr()

In [None]:
# Outlier bounds for log_so2: mean plus/minus three standard deviations.
log_so2_mean = fso4_df['log_so2'].mean()
log_so2_std = fso4_df['log_so2'].std()
upper_limit = log_so2_mean + 3 * log_so2_std
lower_limit = log_so2_mean - 3 * log_so2_std

In [None]:
# Keep only the rows whose log_so2 lies strictly between the two limits.
within_limits = (fso4_df['log_so2'] > lower_limit) & (fso4_df['log_so2'] < upper_limit)
fso4_df_removedoutliers = fso4_df[within_limits]

In [None]:
# Correlation matrix after dropping the log_so2 outliers.
fso4_df_removedoutliers.loc[:, ['rh', 'log_so2', 'fso4']].corr()

In [None]:
# Display the working frame (including the log_so2 column added above) for a
# visual check.
fso4_df

In [None]:
# Attach the FOLLOWING row's rh to each row. shift(-1) moves values up by one
# position, so row i receives the rh of row i+1 and the last row becomes NaN.
# (shift(1) would attach the previous row's value, contradicting the column
# name.)
# NOTE(review): the shift runs over the whole frame in its current row order;
# it only means "next month" if consecutive rows are consecutive months of the
# same location. Confirm the row ordering, or shift within a per-location
# groupby.
fso4_df['rh_nextmonth'] = fso4_df['rh'].shift(-1)

# Machine Learning

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from math import sqrt

In [4]:
# Reload the training table from disk; this rebinds fso4_df, so columns added
# to the previous frame in earlier cells are not carried over.
fso4_df = pd.read_csv('/home/krishna/UEInfo/SO2/data/so2_fso4.csv')
month_dtype = pd.CategoricalDtype(categories=month_order, ordered=True)
fso4_df['month'] = fso4_df['month'].astype(month_dtype)

# Build a string location key by concatenating ix and iy.
# NOTE(review): there is no separator between the two parts, so e.g. ix=1,
# iy=23 and ix=12, iy=3 would both give '123'. Confirm ix/iy always have the
# same number of digits, or add a separator here AND in the test-set cell.
for coord in ('ix', 'iy'):
    fso4_df[coord] = fso4_df[coord].astype('str')
fso4_df['loc'] = fso4_df['ix'] + fso4_df['iy']

# There are outliers in so2 column
so2_with_offset = fso4_df['so2'] + 0.0001
fso4_df['log_so2'] = np.log(so2_with_offset)

# Remove remaining outliers: keep log_so2 strictly within mean +/- 3 std.
log_so2_mean = fso4_df['log_so2'].mean()
log_so2_std = fso4_df['log_so2'].std()
upper_limit = log_so2_mean + 3 * log_so2_std
lower_limit = log_so2_mean - 3 * log_so2_std
within_limits = (fso4_df['log_so2'] < upper_limit) & (fso4_df['log_so2'] > lower_limit)
fso4_df_removedoutliers = fso4_df[within_limits]

In [5]:
# Training design matrix: log_so2 and rh stay numeric; month, loc and
# time_category are one-hot encoded with the first level of each dropped.
# NOTE(review): features and target are taken from fso4_df, not from
# fso4_df_removedoutliers, so the outlier filter computed in the previous cell
# has no effect on training. Confirm this is intended.
feature_columns = ['log_so2', 'time_category', 'rh', 'month', 'loc']
categorical_columns = ['month', 'loc', 'time_category']

X_train = pd.get_dummies(
    fso4_df[feature_columns],
    columns=categorical_columns,
    drop_first=True,
)

# Regression target.
y_train = fso4_df['fso4']

In [6]:
# TEST
# Load the 2020 table and prepare it the same way as the training frame.
fso4_2020_df = pd.read_csv('/home/krishna/UEInfo/SO2/data/so2_fso4_2020.csv')
fso4_2020_df['month'] = pd.Categorical(fso4_2020_df['month'], categories=month_order, ordered=True)

# String location key: ix and iy concatenated (same construction as used for
# the training frame). Bracket access avoids the clash between a column named
# 'ix' and the DataFrame.ix indexer on older pandas versions.
fso4_2020_df['ix'] = fso4_2020_df['ix'].astype('str')
fso4_2020_df['iy'] = fso4_2020_df['iy'].astype('str')
fso4_2020_df['loc'] = fso4_2020_df['ix'] + fso4_2020_df['iy']

# There are outliers in so2 column
fso4_2020_df['log_so2'] = np.log(fso4_2020_df['so2'] + 0.0001)

X_test = fso4_2020_df[['log_so2', 'time_category', 'rh', 'month', 'loc']]
X_test = pd.get_dummies(X_test, columns=['month', 'loc', 'time_category'],
                        drop_first=True)

# get_dummies only creates columns for the levels present in THIS file, so the
# encoded test columns can differ from the training columns in both set and
# order. Align to the training layout: dummy columns missing from the 2020
# data are added as all-zero, and levels never seen in training are dropped.
# When the two layouts already match this is a no-op.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

y_test = fso4_2020_df.fso4

In [None]:
# Fit a random forest (100 trees, fixed seed for repeatable results) on the
# training design matrix.
rf_params = dict(n_estimators=100, random_state=42)
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [None]:
# Disabled: 10-fold cross-validation of `model` on the training data.
# With scoring='neg_mean_squared_error' the returned scores are negated MSE
# values (one per fold), so the printed numbers would be <= 0.
# mse = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
# print('MSE:', mse)

In [None]:
# Predict fso4 for the test features with the fitted random forest.
y_pred = model.predict(X_test)

In [None]:
# Root-mean-squared error of the predictions against the test target.
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order;
# the value is unchanged because squared error is symmetric, but this keeps
# the call consistent with the other metric calls.
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: ',rmse)

In [None]:
# Coefficient of determination of the predictions on the test target.
# r2_score is NOT symmetric: its signature is r2_score(y_true, y_pred) and the
# denominator is the variance of the first argument. Passing the predictions
# first normalises by the variance of the predictions and reports a different
# (wrong) score, so the true values must come first.
r2 = r2_score(y_test, y_pred)
print('R2: ',r2)

In [None]:
# Shape of the prediction array (sanity check before attaching it to the frame).
np.shape(y_pred)

In [None]:
# Store the current contents of y_pred as a new column on the 2020 frame.
fso4_2020_df['fso4_predicted'] = y_pred

In [14]:
# Write the 2020 frame, with whatever prediction columns it holds at this
# point, to disk. to_csv's default index=True also writes the row index as an
# unnamed first column.
# NOTE(review): this cell sits before the cells that add 'fso4_predicted_lr';
# on a top-to-bottom run the saved file will not contain that column.
fso4_2020_df.to_csv('/home/krishna/UEInfo/SO2/data/so2_fso4_2020_predicted.csv')

In [10]:
# Ordinary least-squares baseline fitted on the same training design matrix.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

LinearRegression()

In [11]:
# Prediction
# Predict fso4 for the test features with the fitted linear regression.
# This rebinds y_pred, replacing the random-forest predictions stored under
# the same name by an earlier cell.
y_pred = regression.predict(X_test)


In [13]:
# Store the current contents of y_pred (set by the linear-regression cell
# above) as a new column on the 2020 frame.
fso4_2020_df['fso4_predicted_lr'] = y_pred

In [16]:
# Pull the fitted parameters out of the linear model.
intercept = regression.intercept_
coefficients = regression.coef_

# One row per design-matrix column: its name and its fitted coefficient.
coef_df = pd.DataFrame({'var': X_train.columns, 'coef': coefficients})

In [17]:
# Display the coefficient table (one row per design-matrix column).
coef_df

Unnamed: 0,var,coef
0,log_so2,-0.103915
1,rh,-0.004606
2,month_FEB,0.006795
3,month_MAR,-0.019861
4,month_APR,-0.017905
...,...,...
1857,loc_9996,0.012607
1858,loc_9997,0.016598
1859,loc_9998,0.045911
1860,loc_9999,0.064938


In [18]:
# Display the fitted intercept of the linear regression.
intercept

0.9998408466480354