<a href="https://colab.research.google.com/github/mahsahadian/Cloud-TP2/blob/master/SVM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook creates a SVM model to predict temperature.

## First, we start by importing the required libraries

In [0]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from pandas.plotting import scatter_matrix
from sklearn.metrics import accuracy_score

## We then load the data from the github url

In [0]:
url = 'https://raw.githubusercontent.com/tna-hub/Cloud-TP2/master/dataset.csv'
data = pd.read_csv(url)
data = data.set_index('No')

## After loading the data, we clean them from NaN values (empty cells) and unecessary columns (Removing 'PM2.5','PM10','SO2','NO2','CO','O3' columns and rows having cells with NaN values)

In [5]:
cols = ['PM2.5','PM10','SO2','NO2','CO','O3']
data = data.drop(cols,axis=1)
data = data.dropna()
data

Unnamed: 0_level_0,year,month,day,hour,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2013,3,1,0,0.1,1021.1,-18.6,0.0,NW,4.4,Gucheng
2,2013,3,1,1,-0.3,1021.5,-19.0,0.0,NW,4.0,Gucheng
3,2013,3,1,2,-0.7,1021.5,-19.8,0.0,WNW,4.6,Gucheng
4,2013,3,1,3,-1.0,1022.7,-21.2,0.0,W,2.8,Gucheng
5,2013,3,1,4,-1.3,1023.0,-21.4,0.0,WNW,3.6,Gucheng
...,...,...,...,...,...,...,...,...,...,...,...
35060,2017,2,28,19,12.6,1011.9,-14.3,0.0,N,2.0,Wanliu
35061,2017,2,28,20,9.4,1012.3,-11.9,0.0,WSW,1.0,Wanliu
35062,2017,2,28,21,8.7,1012.8,-13.7,0.0,N,1.1,Wanliu
35063,2017,2,28,22,7.8,1012.9,-12.6,0.0,NNE,1.0,Wanliu


## As asked in the assignment, we categorize the temperature into verycold, cold, hot or veryhot. At the end of this cell, the TEMP column will be categorized and not having any numerical values

In [6]:
def categorizeTemp(column):
    temp_cat = column.apply(
    lambda x: 'verycold' if x <0 else (
        'cold' if 0<=x< 10 else (
            'moderate' if 10<=x<20 else (
                'hot' if 20<=x<30 else 'veryhot'))))
    return temp_cat

data.TEMP = categorizeTemp(data.TEMP)
data

Unnamed: 0_level_0,year,month,day,hour,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2013,3,1,0,cold,1021.1,-18.6,0.0,NW,4.4,Gucheng
2,2013,3,1,1,verycold,1021.5,-19.0,0.0,NW,4.0,Gucheng
3,2013,3,1,2,verycold,1021.5,-19.8,0.0,WNW,4.6,Gucheng
4,2013,3,1,3,verycold,1022.7,-21.2,0.0,W,2.8,Gucheng
5,2013,3,1,4,verycold,1023.0,-21.4,0.0,WNW,3.6,Gucheng
...,...,...,...,...,...,...,...,...,...,...,...
35060,2017,2,28,19,moderate,1011.9,-14.3,0.0,N,2.0,Wanliu
35061,2017,2,28,20,cold,1012.3,-11.9,0.0,WSW,1.0,Wanliu
35062,2017,2,28,21,cold,1012.8,-13.7,0.0,N,1.1,Wanliu
35063,2017,2,28,22,cold,1012.9,-12.6,0.0,NNE,1.0,Wanliu


## These fonctions encode and reverse encode (unencode) alphabetical values to numerical ones using LabelEncoder

In [7]:
# Encoding String values into numerical values
def encode(data, columns):
  for column in columns:
    encs = {}
    encs[column] = LabelEncoder()
    data[column] = encs[column].fit_transform(data[column])
  return encs

# unEncoding back String values into numerical values
def unencode(enc, data, columns):
  for column in columns:
    data[column] = enc.inverse_transform(data[column])

# Encoding String values into numerical values (wd, station and TEMP)
encs = encode(data, ['wd', 'station', 'TEMP'])
print(data)


       year  month  day  hour  TEMP    PRES  DEWP  RAIN  wd  WSPM  station
No                                                                        
1      2013      3    1     0     0  1021.1 -18.6   0.0   7   4.4        5
2      2013      3    1     1     3  1021.5 -19.0   0.0   7   4.0        5
3      2013      3    1     2     3  1021.5 -19.8   0.0  14   4.6        5
4      2013      3    1     3     3  1022.7 -21.2   0.0  13   2.8        5
5      2013      3    1     4     3  1023.0 -21.4   0.0  14   3.6        5
...     ...    ...  ...   ...   ...     ...   ...   ...  ..   ...      ...
35060  2017      2   28    19     2  1011.9 -14.3   0.0   3   2.0       10
35061  2017      2   28    20     0  1012.3 -11.9   0.0  15   1.0       10
35062  2017      2   28    21     0  1012.8 -13.7   0.0   3   1.1       10
35063  2017      2   28    22     0  1012.9 -12.6   0.0   5   1.0       10
35064  2017      2   28    23     0  1012.6 -11.2   0.0   4   1.1       10

[383879 rows x 11 column

## Now that the data have been cleaned and categorized, we can split the data into training and testing set.
1. > Some models (neural network for example)  may have difficulty converging before the maximum number of iterations allowed if the data is not normalized. They may be sensitive to feature scaling, so it is highly recommended to scale the data. we also apply the same scaling to the test set for meaningful results. There are a lot of different methods for normalization of data, we  use the built-in StandardScaler for standardization. This is called Feature scaling.

In [9]:
from sklearn.preprocessing import StandardScaler

def split(data):
  #X will be used to train the model. it is a common practice to name it upper case x
  X = data.drop('TEMP', axis=1)
  #y is the data that shall be predicted by the model. It is a common practice in ML to name it lower case y
  y = data['TEMP']

  #Now split into training and testing sets (20% data will be for testing)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
  return X_train, X_test, y_train, y_test

#split the data into test and train
X_train, X_test, y_train, y_test = split(data)

# We then scale the data.
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)

# Now apply the transformations to the training and testing set data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train

array([[-0.55860223, -0.1518431 , -0.42243588, ...,  0.50274696,
         0.05306727,  0.94943386],
       [ 1.14217873,  0.13821675,  0.71407575, ...,  0.72369616,
         0.13312504, -0.94740384],
       [-1.40899271, -0.44190294,  0.94137808, ..., -0.60199902,
        -0.58739485,  1.5817131 ],
       ...,
       [ 1.14217873, -0.73196278, -0.87704053, ...,  0.06084857,
         0.6935294 , -0.94740384],
       [-1.40899271,  1.58851596,  0.94137808, ..., -1.26484661,
        -0.34722155, -0.63126422],
       [ 0.29178825, -1.31208247, -0.08148239, ..., -1.4857958 ,
        -0.74751038, -0.94740384]])

## This is the second model (RandomForestClassifier)




In [11]:
from sklearn.ensemble import RandomForestClassifier

def Model2():
  # Create the model and train the the training sets
  model = RandomForestClassifier(max_depth=None, min_samples_split=2, min_samples_leaf=1)
  model.fit(X_train, y_train)
  score = round(model.score(X_test, y_test, sample_weight=None)*100, 3)
  print("Accuracy of model 2 RandomForestClassifier is: {}%".format(score))

  #predicting the  Temperature
  y_pred = model.predict(X_test)
  df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
  unencode(encs['TEMP'], df, ['Actual', 'Predicted'])
  print(df)

Model2()


Accuracy of model 2 RandomForestClassifier is: 96.391%
         Actual Predicted
No                       
31987  moderate  moderate
18525  moderate  moderate
2848    veryhot   veryhot
1000   moderate  moderate
17388      cold      cold
...         ...       ...
22442       hot       hot
17796      cold      cold
33863  verycold  verycold
22280  moderate  moderate
8455   verycold  verycold

[76776 rows x 2 columns]


## This is the third model (Neural Network  Multi-Layer Perceptron Classifier model "MLPClassifier")


 

2. > Next we create the model, there are a lot of parameters we can choose to define and customize here, we will only define the hidden_layer_sizes. For this parameter we pass in a tuple consisting of the number of neurons we want at each layer, where the nth entry in the tuple represents the number of neurons in the nth layer of the MLP model. There are many ways to choose these numbers, but for simplicity we will choose 20 layers with the same number of neurons

In [12]:
from sklearn.neural_network import MLPClassifier

def Model3():
  # Create the model and train the the training sets
  model = MLPClassifier(hidden_layer_sizes=(20,20,20))
  model.fit(X_train, y_train)
  score = round(model.score(X_test, y_test, sample_weight=None)*100, 3)
  print("Accuracy of model 3 MLPClassifier is: {}%".format(score))

  #predicting the  Temperature
  y_pred = model.predict(X_test)
  df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
  unencode(encs['TEMP'], df, ['Actual', 'Predicted'])
  print(df)

Model3()




Accuracy of model 3 MLPClassifier is: 84.114%
         Actual Predicted
No                       
31987  moderate  moderate
18525  moderate  moderate
2848    veryhot   veryhot
1000   moderate  moderate
17388      cold      cold
...         ...       ...
22442       hot  moderate
17796      cold      cold
33863  verycold  verycold
22280  moderate  moderate
8455   verycold  verycold

[76776 rows x 2 columns]


## This is the 4th model (XGBoost)

> XGBoost is an implementation of gradient boosted decision trees designed for speed and performance that is dominative competitive machine learning.

In [0]:
from xgboost import XGBClassifier
def Model4(data):
  # Create the model and fit model no training data
  model = XGBClassifier()
  model.fit(X_train, y_train)

  # make predictions for test data
  y_pred = model.predict(X_test)
  predictions = [round(value) for value in y_pred]

  # evaluate predictions
  score = accuracy_score(y_test, predictions)
  print("Accuracy of model 4 XGBClassifier is: {}%".format(score))
  df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
  print(df)


## This is the 5th model (DecisionTreeClassifier)




In [14]:
from sklearn import tree
def Model5():
  # Create the model and train the the training sets
  model = tree.DecisionTreeClassifier()
  model.fit(X_train, y_train)
  score = round(model.score(X_test, y_test, sample_weight=None)*100, 3)
  print("Accuracy of model 5 DecisionTreeClassifier is: {}%".format(score))

  #predicting the  Temperature
  y_pred = model.predict(X_test)
  df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
  unencode(encs['TEMP'], df, ['Actual', 'Predicted'])
  print(df)

Model5()


Accuracy of model 5 DecisionTreeClassifier is: 94.462%
         Actual Predicted
No                       
31987  moderate  moderate
18525  moderate  moderate
2848    veryhot   veryhot
1000   moderate  moderate
17388      cold      cold
...         ...       ...
22442       hot       hot
17796      cold      cold
33863  verycold  verycold
22280  moderate  moderate
8455   verycold  verycold

[76776 rows x 2 columns]


## Now we train the first model and print the results (SVM)

In [0]:
def Model1():
  # Create the model and train the the training sets
  model = SVC(gamma=0.1, kernel='linear', C=1000)
  model.fit(X_train, y_train)
  score = round(model.score(X_test, y_test, sample_weight=None)*100, 3)
  print("Accuracy of model 1 SVM is: {}%".format(score))

  #predicting the  Temperature
  y_pred = model.predict(X_test)
  df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
  unencode(encs['TEMP'], df, ['Actual', 'Predicted'])
  print(df)

Model1()