In [18]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

In [19]:
df=pd.read_csv(r'/home/tom/Python/Machine learning/Weather_Data.csv')
df.head()
df.columns

Index(['Date', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [20]:
# PreProcessing : Another way of replacing aplphabets with numbers (ignore this cell)

from sklearn import preprocessing

df_array = df.values

YesOrNo = preprocessing.LabelEncoder()

YesOrNo.fit(['No', 'Yes'])

df_array[:, -2] = YesOrNo.transform(df_array[:, -2]) # RainToday column


In [21]:
# Preprocessing method 2 (use this cell)

df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)


In [22]:
df_sydney_processed.drop('Date',axis=1,inplace=True)
df_sydney_processed = df_sydney_processed.astype(float)
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1) # For X values ?
Y = df_sydney_processed['RainTomorrow']  #Y values


In [23]:
# Multi-Linear Regression

from sklearn import linear_model

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split( features, Y, test_size=0.2, random_state=10)

LinearReg = linear_model.LinearRegression()

LinearReg.fit(x_train, y_train)

predictions = LinearReg.predict(x_test)


LinearRegression_MAE = mean_absolute_error(y_test,predictions)
LinearRegression_MSE = mean_squared_error(y_test,predictions)
LinearRegression_R2 = r2_score(y_test,predictions)


# or


print("Mean absolute error: %.2f" % np.mean(np.absolute(predictions - y_test)))
print("Residual sum of squares (MSE): %.2f" % np.mean((predictions - y_test) ** 2))
print("R2-score: %.2f" % r2_score(y_test,predictions) )

print(LinearRegression_MAE, LinearRegression_MSE , LinearRegression_R2)

Mean absolute error: 0.26
Residual sum of squares (MSE): 0.12
R2-score: 0.43
0.25631835588062085 0.11572007059217618 0.4271346431527694


In [24]:
# KNN
from sklearn import preprocessing

# Preprocessing if needed
#x_train = preprocessing.StandardScaler().fit(x_train).transform(x_train.astype(float))
#x_test = preprocessing.StandardScaler().fit(x_test).transform(x_test.astype(float))



from sklearn.neighbors import KNeighborsClassifier

#Train Model and Predict  
KNN = KNeighborsClassifier(n_neighbors = 10).fit(x_train,y_train)
predictions = KNN.predict(x_test)


from sklearn import metrics



KNN_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
KNN_JaccardIndex = metrics.jaccard_score(y_test, predictions)
KNN_F1_Score = metrics.f1_score(y_test, predictions)

In [25]:
# DecisionTree

Tree = DecisionTreeClassifier(criterion="entropy", max_depth = 4).fit(x_train,y_train)
predictions = Tree.predict(x_test)


Tree_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
Tree_JaccardIndex = metrics.jaccard_score(y_test, predictions)
Tree_F1_Score = metrics.f1_score(y_test, predictions)

In [26]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix



x_train, x_test, y_train, y_test = train_test_split( features, Y, test_size=0.2, random_state=1)

LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train,y_train)

predictions = LR.predict(x_test) 

predict_proba = LR.predict_proba(x_test)

print(confusion_matrix(y_test, predictions, labels=[1,0])) # To print confusion matrix (confusion matrix: The diagram inside f1 score disciprtion in notes)

LR_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
LR_JaccardIndex = metrics.jaccard_score(y_test, predictions)
LR_F1_Score = metrics.f1_score(y_test, predictions)
LR_Log_Loss = metrics.log_loss(y_test, predictions)


[[106  76]
 [ 37 436]]


In [27]:
# SVM

from sklearn import svm
SVM = svm.SVC(kernel='rbf').fit(x_train, y_train) 
predictions= SVM.predict(x_test)



SVM_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
SVM_JaccardIndex = metrics.jaccard_score(y_test, predictions)
SVM_F1_Score = metrics.f1_score(y_test, predictions)


In [29]:
index_labels = ['KNN', 'Tree', 'LR', 'SVM']
d = {'Accuracy_Score': [KNN_Accuracy_Score, Tree_Accuracy_Score, LR_Accuracy_Score, SVM_Accuracy_Score], 'JaccardIndex': [KNN_JaccardIndex, Tree_JaccardIndex, LR_JaccardIndex, SVM_JaccardIndex], 'F1 Score': [KNN_F1_Score, Tree_F1_Score, LR_F1_Score, SVM_F1_Score], 'Log Loss': ['nan', 'nan', LR_Log_Loss, 'nan']}
df = pd.DataFrame(data=d, index=index_labels)
df

Unnamed: 0,Accuracy_Score,JaccardIndex,F1 Score,Log Loss
KNN,0.824427,0.439024,0.610169,
Tree,0.818321,0.480349,0.648968,
LR,0.827481,0.484018,0.652308,6.218218
SVM,0.722137,0.0,0.0,
