In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns

import os

# The default below is a machine-specific absolute path. Set the WEATHER_AUS_CSV
# environment variable to load the file from another location without editing
# the notebook; when it is unset the original path is used unchanged.
DATA_PATH = os.environ.get(
    'WEATHER_AUS_CSV',
    r'D:\Augustana University\Courses\5. Spring 2020\COSC 380-A Artificial Intelligence & Robotics\Projects\Final Project\Final Final\weatherAUS.csv',
)

df = pd.read_csv(DATA_PATH)

In [None]:
df.shape

In [None]:
df.head(10)

**Date**: The date of observation<br/>
**Location:** The common name of the location of the weather station<br/>
**MinTemp:** The minimum temperature in degrees celsius<br/>
**MaxTemp:** The maximum temperature in degrees celsius<br/>
**Rainfall:** The amount of rainfall recorded for the day in mm<br/>
**Evaporation:** The so-called Class A pan evaporation (mm) in the 24 hours to 9am<br/>
**Sunshine:** The number of hours of bright sunshine in the day.<br/>
**WindGustDir:** The direction of the strongest wind gust in the 24 hours to midnight<br/>
**WindGustSpeed:** The speed (km/h) of the strongest wind gust in the 24 hours to midnight<br/>
**WindDir9am:** Direction of the wind at 9am<br/>
**WindDir3pm:** Direction of the wind at 3pm<br/>
**WindSpeed9am:** Wind speed (km/hr) averaged over 10 minutes prior to 9am<br/>
**WindSpeed3pm:** Wind speed (km/hr) averaged over 10 minutes prior to 3pm<br/>
**Humidity9am:** Humidity (percent) at 9am<br/>
**Humidity3pm:** Humidity (percent) at 3pm<br/>
**Pressure9am:** Atmospheric pressure (hpa) reduced to mean sea level at 9am<br/>
**Pressure3pm:** Atmospheric pressure (hpa) reduced to mean sea level at 3pm<br/>
**Cloud9am:** Fraction of sky obscured by cloud at 9am. This is measured in oktas, which are a unit of eighths. It records how many eighths of the sky are obscured by cloud.<br/>
**Cloud3pm:** Fraction of sky obscured by cloud at 3pm. This is measured in oktas, which are a unit of eighths. It records how many eighths of the sky are obscured by cloud.<br/>
**Temp9am:** Temperature (degrees C) at 9am<br/>
**Temp3pm:** Temperature (degrees C) at 3pm<br/>
**RainToday:** Boolean: 1 if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 0<br/>
**RISK_MM:** The amount of next day rain in mm. Used to create response variable RainTomorrow. A kind of measure of the "risk".<br/>
**RainTomorrow:** The target variable. Did it rain tomorrow?<br/>

In [None]:
df.describe()

In [None]:
# Date and Location are not used as predictors. RISK_MM is the next-day rainfall
# amount from which RainTomorrow is derived, so it must not be kept as a feature.
unused_columns = ['Date', 'Location', 'RISK_MM']
df = df.drop(columns=unused_columns)

In [None]:
# Replacing Yes by 1 and No by 0 for RainToday and RainTomorrow columns
#
# The encoded Series is assigned back to the column. Calling
# `df[col].replace(..., inplace=True)` operates on a temporary Series: pandas 2.2
# warns about it and, with Copy-on-Write (the default from pandas 3.0), `df`
# would be left unchanged.
yes_no_to_int = {'No': 0, 'Yes': 1}

for column in ['RainToday', 'RainTomorrow']:
    # Missing values stay NaN here; rows containing NaN are removed by the
    # dropna step further down.
    df[column] = df[column].map(yes_no_to_int)

In [None]:
df['WindGustDir'].unique()

In [None]:
# Integer code for each compass direction of the strongest gust. The codes are
# arbitrary labels (they do not follow the compass order).
wind_gust_codes = {'W': 1, 'WNW': 2, 'WSW': 3, 'NE': 4, 'NNW': 5, 'N': 6, 'NNE': 7, 'SW': 8,
                   'ENE': 9, 'SSE': 10, 'S': 11, 'NW': 12, 'SE': 13, 'ESE': 14,
                   'E': 15, 'SSW': 16}

# Assign the result back instead of using `replace(..., inplace=True)` on
# `df['WindGustDir']`: the chained in-place form modifies a temporary Series and
# does not update `df` under pandas Copy-on-Write. Missing directions stay NaN.
df['WindGustDir'] = df['WindGustDir'].map(wind_gust_codes)

In [None]:
import seaborn as sns

# Correlation heatmap of the numeric columns.
f, ax = plt.subplots(figsize=(20, 15))

# Select numeric columns explicitly: the wind-direction columns that have not
# been one-hot encoded yet are non-numeric, and recent pandas versions raise on
# them instead of silently skipping them in corr().
corr = df.select_dtypes(include='number').corr()

# `np.bool` was removed from NumPy (1.24); the builtin `bool` is the supported
# spelling. The mask is all False, so no cell of the matrix is hidden.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax, annot = True)

In [None]:
sns.pairplot(df)

In [None]:
categorical = ['WindDir3pm', 'WindDir9am']

In [None]:
# Adding dummy variables for categorical predictors
#
# Rows with a missing value in any of the `categorical` columns are removed
# first. get_dummies (with its default dummy_na=False) encodes NaN as all-zero
# indicators, which would make those rows look like the level removed by
# drop_first and let them slip past the dropna step that follows.
df = pd.get_dummies(df.dropna(subset = categorical), columns = categorical, drop_first = True)

In [None]:
# Keep only the rows in which every column has a value
# (equivalent to dropping a row as soon as any field is NaN).

complete_rows = df.notna().all(axis=1)
df = df.loc[complete_rows]
df.shape

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Fit the scaler on the training rows only. Fitting it on the whole frame lets
# the validation and test rows influence the min/max used for scaling, i.e. it
# leaks information from the evaluation data into preprocessing.
#
# The training rows are found by repeating, on row positions, the two splits done
# in the "Splitting the dataset" cell: for the same number of rows, test_size and
# random_state, train_test_split selects the same positions. Keep these two
# calls in sync with that cell.
row_positions = np.arange(len(df))
train_val_positions, _ = train_test_split(row_positions, test_size=0.2, random_state=42)
train_positions, _ = train_test_split(train_val_positions, test_size=0.2, random_state=42)

scale = preprocessing.MinMaxScaler()
scale.fit(df.iloc[train_positions])

# All rows are transformed with the training-set ranges, so validation/test
# values outside those ranges can fall slightly outside [0, 1].
df = pd.DataFrame(scale.transform(df), index = df.index, columns = df.columns)

In [None]:
X = df.drop(labels = ['RainTomorrow'], axis = 1)
X

In [None]:
y = df['RainTomorrow']
y

In [None]:
# Splitting the dataset
from sklearn.model_selection import train_test_split

# Two successive 80/20 splits with the same seed: the first holds out the test
# set, the second carves the validation set out of what remains
# (64% train / 16% validation / 20% test overall).
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, **split_options)

In [None]:
X_train.shape

In [None]:
X_val.shape

In [None]:
X_test.shape

In [None]:
# Dimension reduction: keep the principal components that together explain
# 95% of the variance.

from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA for as many components as are
# needed to reach that share of explained variance.
pca = PCA(n_components = 0.95)

# The projection is learned from the training split only ...
X_train = pca.fit_transform(X_train)

# ... and then applied unchanged to the held-out splits.
X_val, X_test = (pca.transform(held_out) for held_out in (X_val, X_test))

In [None]:
X_train.shape

In [None]:
X_val.shape

In [None]:
X_test.shape

# Models

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression (liblinear solver, fixed seed) trained on the
# training split. fit() returns the estimator itself.
logreg = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)

logreg

In [None]:
y_pred_test = logreg.predict(X_test)

y_pred_test

In [None]:
# probability of no rain (0)

logreg.predict_proba(X_test)[:,0]

In [None]:
# probability of rain (1)

logreg.predict_proba(X_test)[:,1]

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the baseline logistic regression on the held-out test split.
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Model accuracy score: {test_accuracy:0.4f}')

In [None]:
# Accuracy on the data the model was trained on, for comparison with the test score.
y_pred_train = logreg.predict(X_train)

train_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Training-set accuracy score: {train_accuracy:0.4f}')

In [None]:
# score() reports mean accuracy for a classifier.
train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)

print(f'Training set score: {train_score:.4f}')
print(f'Test set score: {test_score:.4f}')

# The two scores are close to each other, which suggests the model is not overfitting.

In [None]:
# Same model with C=100. In scikit-learn C is the inverse of the regularisation
# strength, so this variant is regularised less than the default (C=1).
logreg100 = LogisticRegression(C=100, solver='liblinear', random_state=0).fit(X_train, y_train)

logreg100

In [None]:
# Mean accuracy of the C=100 model on the training and test splits.
train_score_c100 = logreg100.score(X_train, y_train)
test_score_c100 = logreg100.score(X_test, y_test)

print(f'Training set score: {train_score_c100:.4f}')
print(f'Test set score: {test_score_c100:.4f}')

In [None]:
from sklearn.metrics import confusion_matrix

# scikit-learn puts the actual classes on the rows and the predicted classes on
# the columns, both in sorted label order (0 first, then 1):
#     [[TN, FP],
#      [FN, TP]]
cm = confusion_matrix(y_test, y_pred_test)

# Label the axes to match that layout (rows = actual, columns = predicted,
# negative class first).
cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_test))

In [None]:
# confusion_matrix returns [[TN, FP], [FN, TP]]: rows are the actual classes and
# columns the predicted classes, in sorted label order, so the positive class
# (rain = 1) sits in the second row and second column.
TN = cm[0,0]
FP = cm[0,1]
FN = cm[1,0]
TP = cm[1,1]

# Accuracy is symmetric in TP/TN; the metrics computed from these counts in the
# following cells (precision, recall, specificity, ...) are not.
classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)

print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

In [None]:
# Error rate = (FP + FN) / all predictions, using the counts taken from `cm`.
n_predictions = float(TP + TN + FP + FN)
classification_error = (FP + FN) / n_predictions

print(f'Classification error : {classification_error:0.4f}')

In [None]:
# Precision = TP / (TP + FP), using the counts taken from `cm`.
predicted_positive = float(TP + FP)
precision = TP / predicted_positive

print(f'Precision : {precision:0.4f}')

In [None]:
# Recall (sensitivity) = TP / (TP + FN), using the counts taken from `cm`.
actual_positive = float(TP + FN)
recall = TP / actual_positive

print(f'Recall or Sensitivity : {recall:0.4f}')

In [None]:
# True positive rate = TP / (TP + FN); this is the same ratio as recall.
tpr_denominator = float(TP + FN)
true_positive_rate = TP / tpr_denominator

print(f'True Positive Rate : {true_positive_rate:0.4f}')

In [None]:
# False positive rate = FP / (FP + TN), using the counts taken from `cm`.
fpr_denominator = float(FP + TN)
false_positive_rate = FP / fpr_denominator

print(f'False Positive Rate : {false_positive_rate:0.4f}')

In [None]:
# Specificity = TN / (TN + FP), i.e. 1 - false positive rate.
actual_negative = TN + FP
specificity = TN / actual_negative

print(f'Specificity : {specificity:0.4f}')

In [None]:
y_pred_prob = logreg.predict_proba(X_test)[0:10]

y_pred_prob

In [None]:
logreg.predict_proba(X_test)[0:10, 1]

In [None]:
y_pred1 = logreg.predict_proba(X_test)[:, 1]

In [None]:
# Distribution of the predicted probability of rain (class 1) on the test split.
plt.rcParams['font.size'] = 12

plt.hist(y_pred1, bins = 10)
plt.xlim(0,1)  # probabilities live in [0, 1]

plt.title('Histogram of predicted probabilities of rain')
plt.xlabel('Predicted probabilities of rain')
plt.ylabel('Frequency')

In [None]:
# ROC curve of the logistic regression on the test split.

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_pred1)

roc_ax = plt.figure(figsize=(6,4)).gca()
roc_ax.plot(fpr, tpr, linewidth=2)
roc_ax.plot([0,1], [0,1], 'k--' )  # diagonal reference line
plt.rcParams['font.size'] = 12
roc_ax.set_xlabel('False Positive Rate (1 - Specificity)')
roc_ax.set_ylabel('True Positive Rate (Sensitivity)')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the predicted probability of class 1.
ROC_AUC = roc_auc_score(y_test, y_pred1)
print(f'ROC AUC : {ROC_AUC:.4f}')

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC AUC on the training split, averaged over the folds.
fold_auc = cross_val_score(logreg, X_train, y_train, cv=5, scoring='roc_auc')
Cross_validated_ROC_AUC = fold_auc.mean()
print(f'Cross validated ROC AUC : {Cross_validated_ROC_AUC:.4f}')

In [None]:
from sklearn.model_selection import cross_val_score

# Per-fold accuracy of the logistic regression under 5-fold cross-validation.
scores = cross_val_score(logreg, X_train, y_train, scoring='accuracy', cv = 5)
print(f'Cross-validation scores:{scores}')

In [None]:
print('Average cross-validation score: {:.4f}'.format(scores.mean()))

In [None]:
from sklearn.model_selection import GridSearchCV

# Search penalty and C jointly. A list of two separate dicts makes GridSearchCV
# explore two independent grids (penalty with the estimator's C, then C with the
# estimator's penalty), so combinations such as l1 with C=100 are never
# evaluated. Both penalties are supported by the liblinear solver used by logreg.
parameters = {'penalty': ['l1', 'l2'],
              'C': [1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator = logreg,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           verbose=0)
grid_search.fit(X_train, y_train)

In [None]:
# Summary of the search: best mean cross-validated score, the parameters that
# achieved it, and the estimator refit with those parameters.
best_cv_score = grid_search.best_score_
print(f'GridSearch CV best score : {best_cv_score:.4f}\n\n')

print(grid_search.best_params_)

chosen_estimator = grid_search.best_estimator_
print('\n\nEstimator that was chosen by the search :','\n\n', chosen_estimator)

In [None]:
print('GridSearch CV score on test set: {0:0.4f}'.format(grid_search.score(X_test, y_test)))

## Support Vector Machine

In [None]:
from sklearn import svm

# Support vector classifier with scikit-learn's default settings, trained on
# the training split. Both names refer to the same estimator object.
clf_svc = svm.SVC()
model_svm = clf_svc
model_svm.fit(X_train, y_train)

In [None]:
X_train_pred = model_svm.predict(X_train)
confusion_matrix(y_train, X_train_pred)

In [None]:
accuracy_svm_train = accuracy_score(y_train, X_train_pred)
print("accuracy on training set: ", accuracy_svm_train)

In [None]:
# Hyper-parameter grid for the RBF-kernel SVM.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

best_model_svm = GridSearchCV(model_svm, param_grid, verbose = 3)

# Tune on the training split. GridSearchCV cross-validates internally and then
# refits the best candidate on all the data passed to fit(). Fitting on
# (X_val, y_val) would train the final model on the validation rows, so the
# "validation accuracy" computed afterwards would really be a training score and
# the tuned model would never see the training split.
# NOTE: this is slower than searching on the smaller validation split; passing
# n_jobs=-1 to GridSearchCV is an option if the run time is a problem.
best_model_svm.fit(X_train, y_train)

In [None]:
best_model_svm.best_params_

In [None]:
X_val_pred = best_model_svm.predict(X_val)
confusion_matrix(y_val, X_val_pred)

In [None]:
accuracy_svm_val = accuracy_score(y_val, X_val_pred)
print("best model's accuracy on validation set: ", accuracy_svm_val)

In [None]:
X_test_pred = best_model_svm.predict(X_test)
confusion_matrix(y_test, X_test_pred)

In [None]:
accuracy_svm_test = accuracy_score(y_test, X_test_pred)
print("accuracy on test set: ", accuracy_svm_test)

In [None]:
# Bar chart comparing SVM accuracy (in percent) on the training and test splits.
split_accuracy = {'Train': accuracy_svm_train, 'Test': accuracy_svm_test}

models = tuple(split_accuracy)
accuracy = [100 * value for value in split_accuracy.values()]
y_pos = np.arange(len(models))

plt.bar(y_pos, accuracy, align='center', alpha=0.5, color = 'black')
plt.xticks(y_pos, models)
plt.title('Support Vector Machine')
plt.ylabel('Accuracy')
plt.grid(True)

plt.show()

## Gradient Boost Classifier

In [None]:
# Gradient boosting with default hyper-parameters, trained on the training split.
# random_state is fixed so repeated runs build the same trees (the estimator
# uses randomness when permuting features at each split); 42 matches the seed
# used for the train/validation/test split.
model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(X_train, y_train)

In [None]:
X_train_pred = model_gb.predict(X_train)
confusion_matrix(y_train, X_train_pred)

In [None]:
accuracy_gb_train = accuracy_score(y_train, X_train_pred)
print("accuracy on training set: ", accuracy_gb_train)

In [None]:
# Hyper-parameter grid for the gradient boosting classifier.
params = {'learning_rate': [0.001, 0.01, 0.1, 10, 100],
         'max_depth': [10, 100, 1000],
         'n_estimators': [1, 10, 100]
         }

best_model_gb = GridSearchCV(model_gb, params, verbose = 3)

# Tune on the training split. GridSearchCV cross-validates internally and then
# refits the best candidate on all the data passed to fit(). Fitting on
# (X_val, y_val) would train the final model on the validation rows, so the
# "validation accuracy" computed afterwards would really be a training score and
# the tuned model would never see the training split.
# NOTE: this is slower than searching on the smaller validation split; passing
# n_jobs=-1 to GridSearchCV is an option if the run time is a problem.
best_model_gb.fit(X_train, y_train)

In [None]:
best_model_gb.best_params_

In [None]:
X_val_pred = best_model_gb.predict(X_val)
confusion_matrix(y_val, X_val_pred)

In [None]:
accuracy_gb_val = accuracy_score(y_val, X_val_pred)
print("best model's accuracy on validation set: ", accuracy_gb_val)

In [None]:
X_test_pred = best_model_gb.predict(X_test)
confusion_matrix(y_test, X_test_pred)

In [None]:
accuracy_gb_test = accuracy_score(y_test, X_test_pred)
print("accuracy on test set: ", accuracy_gb_test)

In [None]:
# Bar chart comparing gradient boosting accuracy (in percent) on the training
# and test splits.
split_accuracy = {'Train': accuracy_gb_train, 'Test': accuracy_gb_test}

models = tuple(split_accuracy)
accuracy = [100 * value for value in split_accuracy.values()]
y_pos = np.arange(len(models))

plt.bar(y_pos, accuracy, align='center', alpha=0.5, color = 'black')
plt.xticks(y_pos, models)
plt.title('Gradient Boost')
plt.ylabel('Accuracy')
plt.grid(True)

plt.show()