<a href="https://colab.research.google.com/github/tna-hub/Cloud-TP2/blob/master/SVM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook creates an SVM model and a linear regression model to predict temperature.

## First, we start by importing the required libraries

In [0]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from google.colab import drive
from pandas.plotting import scatter_matrix

## We then load the data from the GitHub URL

In [0]:
# Read the CSV straight from the repository and use its 'No' column as the row index.
url = 'https://raw.githubusercontent.com/tna-hub/Cloud-TP2/master/dataset.csv'
data = pd.read_csv(url).set_index('No')

## After loading the data, we clean it by removing the unnecessary columns ('PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3') and dropping the rows that contain NaN values (empty cells)

In [23]:
# Remove the columns that are not needed, then discard every row containing a NaN.
cols = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
data = data.drop(columns=cols).dropna()
data

Unnamed: 0_level_0,year,month,day,hour,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2013,3,1,0,0.1,1021.1,-18.6,0.0,NW,4.4,Gucheng
2,2013,3,1,1,-0.3,1021.5,-19.0,0.0,NW,4.0,Gucheng
3,2013,3,1,2,-0.7,1021.5,-19.8,0.0,WNW,4.6,Gucheng
4,2013,3,1,3,-1.0,1022.7,-21.2,0.0,W,2.8,Gucheng
5,2013,3,1,4,-1.3,1023.0,-21.4,0.0,WNW,3.6,Gucheng
...,...,...,...,...,...,...,...,...,...,...,...
35060,2017,2,28,19,12.6,1011.9,-14.3,0.0,N,2.0,Wanliu
35061,2017,2,28,20,9.4,1012.3,-11.9,0.0,WSW,1.0,Wanliu
35062,2017,2,28,21,8.7,1012.8,-13.7,0.0,N,1.1,Wanliu
35063,2017,2,28,22,7.8,1012.9,-12.6,0.0,NNE,1.0,Wanliu


## As asked in the assignment, we categorize the temperature into verycold, cold, moderate, hot or veryhot. This cell only defines the helper that maps a numeric temperature column to these labels; the TEMP column itself is categorized later, when the SVM model is trained.

In [21]:
def categorizeTemp(column):
    """Map a numeric temperature column to categorical string labels.

    Bins (lower bound inclusive, upper bound exclusive):
        x < 0         -> 'verycold'
        0 <= x < 10   -> 'cold'
        10 <= x < 20  -> 'moderate'
        20 <= x < 30  -> 'hot'
        x >= 30       -> 'veryhot'

    Missing values are returned unchanged. Every comparison against NaN is
    False, so without the explicit check a missing reading would fall through
    all the bins and be labelled 'veryhot'.

    Parameters
    ----------
    column : pandas.Series
        Numeric temperature values.

    Returns
    -------
    pandas.Series
        One label per input element, on the same index as ``column``.
    """
    def _label(temp):
        # Keep missing readings missing instead of inventing a category.
        if pd.isna(temp):
            return temp
        if temp < 0:
            return 'verycold'
        if temp < 10:
            return 'cold'
        if temp < 20:
            return 'moderate'
        if temp < 30:
            return 'hot'
        return 'veryhot'

    return column.apply(_label)

Unnamed: 0_level_0,year,month,day,hour,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2013,3,1,0,cold,1021.1,-18.6,0.0,NW,4.4,Gucheng
2,2013,3,1,1,verycold,1021.5,-19.0,0.0,NW,4.0,Gucheng
3,2013,3,1,2,verycold,1021.5,-19.8,0.0,WNW,4.6,Gucheng
4,2013,3,1,3,verycold,1022.7,-21.2,0.0,W,2.8,Gucheng
5,2013,3,1,4,verycold,1023.0,-21.4,0.0,WNW,3.6,Gucheng
...,...,...,...,...,...,...,...,...,...,...,...
35060,2017,2,28,19,moderate,1011.9,-14.3,0.0,N,2.0,Wanliu
35061,2017,2,28,20,cold,1012.3,-11.9,0.0,WSW,1.0,Wanliu
35062,2017,2,28,21,cold,1012.8,-13.7,0.0,N,1.1,Wanliu
35063,2017,2,28,22,cold,1012.9,-12.6,0.0,NNE,1.0,Wanliu


## Now we need to convert alphabetical values to numerical ones using LabelEncoder (columns: 'wd' and 'station'; 'TEMP' is encoded later, inside the SVM model)

In [24]:
# Replace the string labels in 'wd' and 'station' with integer codes,
# keeping one fitted encoder per column.
le_wd, le_station = LabelEncoder(), LabelEncoder()
data['wd'] = le_wd.fit_transform(data['wd'])
data['station'] = le_station.fit_transform(data['station'])
data

Unnamed: 0_level_0,year,month,day,hour,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2013,3,1,0,0.1,1021.1,-18.6,0.0,7,4.4,5
2,2013,3,1,1,-0.3,1021.5,-19.0,0.0,7,4.0,5
3,2013,3,1,2,-0.7,1021.5,-19.8,0.0,14,4.6,5
4,2013,3,1,3,-1.0,1022.7,-21.2,0.0,13,2.8,5
5,2013,3,1,4,-1.3,1023.0,-21.4,0.0,14,3.6,5
...,...,...,...,...,...,...,...,...,...,...,...
35060,2017,2,28,19,12.6,1011.9,-14.3,0.0,3,2.0,10
35061,2017,2,28,20,9.4,1012.3,-11.9,0.0,15,1.0,10
35062,2017,2,28,21,8.7,1012.8,-13.7,0.0,3,1.1,10
35063,2017,2,28,22,7.8,1012.9,-12.6,0.0,5,1.0,10


## Now that the data have been cleaned and encoded, we define a helper that splits the data into training and testing sets.

In [0]:
def split(data, target='TEMP', test_size=0.20, random_state=42):
    """Split a DataFrame into train/test feature matrices and target vectors.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default 'TEMP'
        Name of the column the model should predict.
    test_size : float, default 0.20
        Value forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # X holds the features (upper case by ML convention): every column except the target.
    X = data.drop(columns=[target])
    # y holds the values to be predicted (lower case by ML convention).
    y = data[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


## This is the second model (LinearRegression)

In [27]:
def Model2(data):
    """Fit a linear regression on the training split and report its test results.

    Prints the value returned by the fitted model's ``score`` on the test
    split, followed by a table pairing each actual test-set target value with
    the model's prediction. Returns nothing.
    """
    X_train, X_test, y_train, y_test = split(data)

    # Train on the training split only.
    regr = linear_model.LinearRegression().fit(X_train, y_train)

    test_score = regr.score(X_test, y_test)
    print("The score of model 2 is: {}".format(test_score))

    # Side-by-side view of the true values and the predicted ones.
    comparison = pd.DataFrame({'Actual': y_test, 'Predicted': regr.predict(X_test)})
    print(comparison)

Model2(data)


The score of model 2 is: 0.8241728303200344
       Actual  Predicted
No                      
31987  13.325  11.931042
18525  13.400  21.274206
2848   34.300  28.417188
1000   16.900   9.827328
17388   1.900   0.090970
...       ...        ...
22442  21.600  19.993101
17796   6.300   3.599878
33863  -3.800   5.437142
22280  15.400  13.815745
8455   -4.200   1.460771

[76776 rows x 2 columns]


## Now we train the first model and print the results (SVM)

In [0]:
def Model1(data):
    """Train SVM classifiers on categorized temperature and print a grid of scores.

    The TEMP column is categorized with ``categorizeTemp`` and label-encoded,
    the frame is split with ``split``, and one ``SVC`` is fitted for every
    combination of kernel, C and gamma. A line with the test-set score is
    printed for each combination, followed by the best combination found.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric 'TEMP' column plus the feature columns. It is not
        modified; the function works on a copy.

    Returns
    -------
    None
    """
    # Work on a copy: overwriting TEMP on the caller's frame would leave it
    # label-encoded, corrupting any later run of Model2 or of this function.
    data = data.copy()

    # First we categorize then encode TEMP
    data['TEMP'] = categorizeTemp(data['TEMP'])
    le_TEMP = LabelEncoder()
    data['TEMP'] = le_TEMP.fit_transform(data['TEMP'])
    print(data)

    # Split the data into test and train
    X_train, X_test, y_train, y_test = split(data)

    # Parameter tuning.
    # NOTE(review): per the scikit-learn docs, gamma only applies to the
    # 'rbf', 'poly' and 'sigmoid' kernels, so the 'linear' runs that differ
    # only in gamma are redundant. They are kept so the printed grid is unchanged.
    kernels = ['linear', 'rbf', 'poly']
    gammas = [1e-3, 1e-4]
    Cs = [1, 10, 100, 1000]
    highest_score = 0
    best_params = None
    for kernel in kernels:
        for c in Cs:
            for gamma in gammas:
                clf = SVC(gamma=gamma, kernel=kernel, C=c).fit(X_train, y_train)
                score = clf.score(X_test, y_test)
                if score > highest_score:
                    highest_score = score
                    best_params = (kernel, gamma, c)
                # Arguments follow the order of the labels in the message
                # (kernel, Gamma, C); they were previously passed as (kernel, c, gamma).
                print("kernel:{} - Gamma:{} - C:{} - Train_size:{}, test_size:{}, score:{}".format(kernel, gamma, c, len(X_train), len(X_test), score))

    # Report the best combination so the tracked maximum is actually used.
    if best_params is not None:
        print("Best score:{} - kernel:{} - Gamma:{} - C:{}".format(highest_score, *best_params))
Model1(data)

       year  month  day  hour  TEMP    PRES  DEWP  RAIN  wd  WSPM  station
No                                                                        
1      2013      3    1     0     0  1021.1 -18.6   0.0   7   4.4        5
2      2013      3    1     1     3  1021.5 -19.0   0.0   7   4.0        5
3      2013      3    1     2     3  1021.5 -19.8   0.0  14   4.6        5
4      2013      3    1     3     3  1022.7 -21.2   0.0  13   2.8        5
5      2013      3    1     4     3  1023.0 -21.4   0.0  14   3.6        5
...     ...    ...  ...   ...   ...     ...   ...   ...  ..   ...      ...
35060  2017      2   28    19     2  1011.9 -14.3   0.0   3   2.0       10
35061  2017      2   28    20     0  1012.3 -11.9   0.0  15   1.0       10
35062  2017      2   28    21     0  1012.8 -13.7   0.0   3   1.1       10
35063  2017      2   28    22     0  1012.9 -12.6   0.0   5   1.0       10
35064  2017      2   28    23     0  1012.6 -11.2   0.0   4   1.1       10

[383879 rows x 11 column