In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import scipy as sp
from sklearn.tree import DecisionTreeClassifier  

from sklearn.model_selection import train_test_split

In [2]:
#Loading the data
data = pd.read_csv('weatherAUS.csv')

In [3]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [4]:
data.count().sort_values()

Sunshine          74377
Evaporation       81350
Cloud3pm          85099
Cloud9am          88536
Pressure9am      128179
Pressure3pm      128212
WindDir9am       132180
WindGustDir      132863
WindGustSpeed    132923
WindDir3pm       138415
Humidity3pm      138583
Temp3pm          139467
WindSpeed3pm     139563
Humidity9am      140419
RainToday        140787
Rainfall         140787
WindSpeed9am     140845
Temp9am          141289
MinTemp          141556
MaxTemp          141871
Date             142193
Location         142193
RISK_MM          142193
RainTomorrow     142193
dtype: int64

In [5]:
#Checking the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
Date             142193 non-null object
Location         142193 non-null object
MinTemp          141556 non-null float64
MaxTemp          141871 non-null float64
Rainfall         140787 non-null float64
Evaporation      81350 non-null float64
Sunshine         74377 non-null float64
WindGustDir      132863 non-null object
WindGustSpeed    132923 non-null float64
WindDir9am       132180 non-null object
WindDir3pm       138415 non-null object
WindSpeed9am     140845 non-null float64
WindSpeed3pm     139563 non-null float64
Humidity9am      140419 non-null float64
Humidity3pm      138583 non-null float64
Pressure9am      128179 non-null float64
Pressure3pm      128212 non-null float64
Cloud9am         88536 non-null float64
Cloud3pm         85099 non-null float64
Temp9am          141289 non-null float64
Temp3pm          139467 non-null float64
RainToday        140787 non-null obje

In [6]:
#Dropping the columns which are not needed
data_use = data.drop(columns = ['Sunshine','Location','Evaporation', 'Cloud3pm','Cloud9am','RISK_MM','Date'],axis=1)

In [7]:
data_use.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,No,No
1,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,No,No
2,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,No,No
3,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,No,No
4,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,No,No


In [8]:
data_use.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 17 columns):
MinTemp          141556 non-null float64
MaxTemp          141871 non-null float64
Rainfall         140787 non-null float64
WindGustDir      132863 non-null object
WindGustSpeed    132923 non-null float64
WindDir9am       132180 non-null object
WindDir3pm       138415 non-null object
WindSpeed9am     140845 non-null float64
WindSpeed3pm     139563 non-null float64
Humidity9am      140419 non-null float64
Humidity3pm      138583 non-null float64
Pressure9am      128179 non-null float64
Pressure3pm      128212 non-null float64
Temp9am          141289 non-null float64
Temp3pm          139467 non-null float64
RainToday        140787 non-null object
RainTomorrow     142193 non-null object
dtypes: float64(12), object(5)
memory usage: 18.4+ MB


In [9]:
data_use.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm
count,141556.0,141871.0,140787.0,132923.0,140845.0,139563.0,140419.0,138583.0,128179.0,128212.0,141289.0,139467.0
mean,12.1864,23.226784,2.349974,39.984292,14.001988,18.637576,68.84381,51.482606,1017.653758,1015.258204,16.987509,21.687235
std,6.403283,7.117618,8.465173,13.588801,8.893337,8.803345,19.051293,20.797772,7.105476,7.036677,6.492838,6.937594
min,-8.5,-4.8,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,-7.2,-5.4
25%,7.6,17.9,0.0,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,12.3,16.6
50%,12.0,22.6,0.0,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,16.7,21.1
75%,16.8,28.2,0.8,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,21.6,26.4
max,33.9,48.1,371.0,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,40.2,46.7


Filling missing values in a columns with average 

In [10]:
data_use['MinTemp'] = data_use['MinTemp'].fillna(data_use['MinTemp'].mean())


In [11]:
data_use['MaxTemp'] = data_use['MaxTemp'].fillna(data_use['MaxTemp'].mean())

In [12]:
data_use['Rainfall'] = data_use['Rainfall'].fillna(data_use['Rainfall'].mean())

In [13]:
data_use['WindGustSpeed'] = data_use['WindGustSpeed'].fillna(data_use['WindGustSpeed'].mean())

In [14]:
data_use['WindSpeed9am'] = data_use['WindSpeed9am'].fillna(data_use['WindSpeed9am'].mean())

In [15]:
data_use['WindSpeed3pm'] = data_use['WindSpeed3pm'].fillna(data_use['WindSpeed3pm'].mean())

In [16]:
data_use['Humidity9am'] = data_use['Humidity9am'].fillna(data_use['Humidity9am'].mean())

In [17]:
data_use['Humidity3pm'] = data_use['Humidity3pm'].fillna(data_use['Humidity3pm'].mean())

In [18]:
data_use['Pressure9am'] = data_use['Pressure9am'].fillna(data_use['Pressure9am'].mean())

In [19]:
data_use['Pressure3pm'] = data_use['Pressure3pm'].fillna(data_use['Pressure3pm'].mean())

In [20]:
data_use['Temp9am'] = data_use['Temp9am'].fillna(data_use['Temp9am'].mean())

In [21]:
data_use['Temp3pm'] = data_use['Temp3pm'].fillna(data_use['Temp3pm'].mean())

In [22]:
data_use.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm
count,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0
mean,12.1864,23.226784,2.349974,39.984292,14.001988,18.637576,68.84381,51.482606,1017.653758,1015.258204,16.987509,21.687235
std,6.388924,7.109554,8.423217,13.138385,8.851082,8.721551,18.932077,20.532065,6.746248,6.681788,6.472166,6.870771
min,-8.5,-4.8,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,-7.2,-5.4
25%,7.6,17.9,0.0,31.0,7.0,13.0,57.0,37.0,1013.5,1011.0,12.3,16.7
50%,12.0,22.7,0.0,39.0,13.0,18.637576,70.0,51.482606,1017.653758,1015.258204,16.8,21.3
75%,16.8,28.2,0.8,46.0,19.0,24.0,83.0,65.0,1021.8,1019.4,21.5,26.3
max,33.9,48.1,371.0,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,40.2,46.7


In [23]:
data_use.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,No,No
1,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,No,No
2,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,No,No
3,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,No,No
4,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,No,No


In [24]:
data_use['WindGustDir'] = data_use['WindGustDir'].fillna(data_use['WindGustDir'].value_counts().index[0])

In [25]:
data_use['WindDir9am'] = data_use['WindDir9am'].fillna(data_use['WindDir9am'].value_counts().index[0])

In [26]:
data_use['WindDir3pm'] = data_use['WindDir3pm'].fillna(data_use['WindDir3pm'].value_counts().index[0])

In [27]:
data_use['RainToday'] = data_use['RainToday'].fillna(data_use['RainToday'].value_counts().index[0])

In [28]:
data_use.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 17 columns):
MinTemp          142193 non-null float64
MaxTemp          142193 non-null float64
Rainfall         142193 non-null float64
WindGustDir      142193 non-null object
WindGustSpeed    142193 non-null float64
WindDir9am       142193 non-null object
WindDir3pm       142193 non-null object
WindSpeed9am     142193 non-null float64
WindSpeed3pm     142193 non-null float64
Humidity9am      142193 non-null float64
Humidity3pm      142193 non-null float64
Pressure9am      142193 non-null float64
Pressure3pm      142193 non-null float64
Temp9am          142193 non-null float64
Temp3pm          142193 non-null float64
RainToday        142193 non-null object
RainTomorrow     142193 non-null object
dtypes: float64(12), object(5)
memory usage: 18.4+ MB


In [29]:
data_use['RainTomorrow'] 

0          No
1          No
2          No
3          No
4          No
5          No
6          No
7          No
8         Yes
9          No
10        Yes
11        Yes
12        Yes
13         No
14         No
15        Yes
16        Yes
17         No
18         No
19         No
20         No
21         No
22         No
23         No
24         No
25         No
26         No
27        Yes
28         No
29         No
         ... 
142163     No
142164     No
142165     No
142166     No
142167     No
142168     No
142169     No
142170     No
142171     No
142172     No
142173     No
142174     No
142175     No
142176     No
142177     No
142178     No
142179     No
142180     No
142181     No
142182     No
142183     No
142184     No
142185     No
142186     No
142187     No
142188     No
142189     No
142190     No
142191     No
142192     No
Name: RainTomorrow, Length: 142193, dtype: object

In [30]:
cat_col = ['WindGustDir','WindDir9am','WindDir3pm']

In [31]:
#Using dummies for categorical values.
data_use = pd.get_dummies(data_use, columns=cat_col)

In [32]:
data_use.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,0,0,0,0,0,0,0,0,1,0
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,0,0,0,0,0,0,0,0,0,1
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,0,0,0,0,0,0,0,0,0,1
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,0,0,0,0,0,0,0,0,0,0
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,0,1,0,0,0,0,0,0,0,0


Replacing binary variable column into 0 or 1

In [33]:
data_use['RainToday'] = data_use['RainToday'].map({'No': 0, 'Yes': 1})

In [34]:
data_use['RainTomorrow'] = data_use['RainTomorrow'].map({'No': 0, 'Yes': 1})

In [35]:
data_use['RainTomorrow'] 

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         1
9         0
10        1
11        1
12        1
13        0
14        0
15        1
16        1
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        1
28        0
29        0
         ..
142163    0
142164    0
142165    0
142166    0
142167    0
142168    0
142169    0
142170    0
142171    0
142172    0
142173    0
142174    0
142175    0
142176    0
142177    0
142178    0
142179    0
142180    0
142181    0
142182    0
142183    0
142184    0
142185    0
142186    0
142187    0
142188    0
142189    0
142190    0
142191    0
142192    0
Name: RainTomorrow, Length: 142193, dtype: int64

In [36]:
data_use.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,0,0,0,0,0,0,0,0,1,0
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,0,0,0,0,0,0,0,0,0,1
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,0,0,0,0,0,0,0,0,0,1
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,0,0,0,0,0,0,0,0,0,0
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,0,1,0,0,0,0,0,0,0,0


In [37]:
data_use['RainTomorrow']

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         1
9         0
10        1
11        1
12        1
13        0
14        0
15        1
16        1
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        1
28        0
29        0
         ..
142163    0
142164    0
142165    0
142166    0
142167    0
142168    0
142169    0
142170    0
142171    0
142172    0
142173    0
142174    0
142175    0
142176    0
142177    0
142178    0
142179    0
142180    0
142181    0
142182    0
142183    0
142184    0
142185    0
142186    0
142187    0
142188    0
142189    0
142190    0
142191    0
142192    0
Name: RainTomorrow, Length: 142193, dtype: int64

In [38]:
y = data_use['RainTomorrow']


In [39]:
new_header = data_use.columns

In [40]:
features = data_use.drop('RainTomorrow',axis=1)

In [41]:
x_train, x_test, y_train, y_test = train_test_split(features,y, test_size = 0.2, random_state=0)

In [42]:
from imblearn.over_sampling import SMOTE

# Resample the minority class
sm = SMOTE(sampling_strategy='minority', random_state=7)

# Fit the model to generate the data.
oversampled_trainX, oversampled_trainY = sm.fit_sample(x_train, y_train)
oversampled_train = pd.concat([ pd.DataFrame(oversampled_trainX),pd.DataFrame(oversampled_trainY)], axis=1)
oversampled_train.columns = new_header

In [43]:
#Fitting random forest classifier.
clf = RandomForestClassifier(n_estimators=100)
clf.fit(oversampled_trainX,oversampled_trainY)
y_pred = clf.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8542846091634727


In [44]:
y_predict_prob = clf.predict_proba(features)

In [45]:
#Binning and attaching to original dataset.
bin_classes = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
bins = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
features['prob_1'] = pd.cut(y_predict_prob[:,0],bins,labels=bin_classes, include_lowest=True)

In [46]:
features.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,prob_1
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,0,0,0,0,0,0,0,1,0,0.8
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,0,0,0,0,0,0,0,0,1,1.0
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,0,0,0,0,0,0,0,0,1,1.0
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,0,0,0,0,0,0,0,0,0,1.0
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,1,0,0,0,0,0,0,0,0,0.9


In [47]:
#Splitting the data
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(features, y, test_size = 0.2, random_state=0)

In [48]:
#Applying decision tree classifier
clf_1 = DecisionTreeClassifier()
print('Before distillation..')
clf_1.fit(x_train, y_train)
y_dt_wkd = clf_1.predict(x_test)
print("Accuracy: ",metrics.accuracy_score(y_test, y_dt_wkd))
print('Precision:', metrics.precision_score(y_test, y_dt_wkd,average='weighted'))
print('Recall:', metrics.recall_score(y_test, y_dt_wkd,average='weighted'))
print('f1-score:', metrics.f1_score(y_test, y_dt_wkd,average='weighted'))

print('After distillation..')
clf_1.fit(x_train_1, y_train_1)
y_dt = clf_1.predict(x_test_1)
print("Accuracy: ",metrics.accuracy_score(y_test_1, y_dt))
print('Precision:', metrics.precision_score(y_test_1, y_dt,average='weighted'))
print('Recall:', metrics.recall_score(y_test_1, y_dt,average='weighted'))
print('f1-score:', metrics.f1_score(y_test_1, y_dt,average='weighted'))

Before distillation..
Accuracy:  0.7806533281760962
Precision: 0.7826530854068533
Recall: 0.7806533281760962
f1-score: 0.7816259265203758
After distillation..
Accuracy:  0.8543900981047153
Precision: 0.8466400945831429
Recall: 0.8543900981047153
f1-score: 0.8476604561923116


In [49]:
#Applying GuassianNB
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
print('Before distillation..')
nb.fit(x_train, y_train)
y_nb_wkd = nb.predict(x_test)
print("Accuracy: ",metrics.accuracy_score(y_test, y_nb_wkd))
print('Precision:', metrics.precision_score(y_test, y_nb_wkd,average='weighted'))
print('Recall:', metrics.recall_score(y_test, y_nb_wkd,average='weighted'))
print('f1-score:', metrics.f1_score(y_test, y_nb_wkd,average='weighted'))

print('After distillation..')
nb.fit(x_train_1, y_train_1)
y_nb = nb.predict(x_test_1)
print("Accuracy: ",metrics.accuracy_score(y_test_1, y_nb))
print('Precision:', metrics.precision_score(y_test_1, y_nb,average='weighted'))
print('Recall:', metrics.recall_score(y_test_1, y_nb,average='weighted'))
print('f1-score:', metrics.f1_score(y_test_1, y_nb,average='weighted'))

Before distillation..
Accuracy:  0.7178170821758852
Precision: 0.7652981284544068
Recall: 0.7178170821758852
f1-score: 0.7342715378375787
After distillation..
Accuracy:  0.8344526882098526
Precision: 0.8431907667522978
Recall: 0.8344526882098526
f1-score: 0.837987796529191


In [50]:
#Applying SGDC classifier
from sklearn.linear_model import SGDClassifier
SGDclf = SGDClassifier(loss='modified_huber',shuffle=True)
print('Before distillation..')
SGDclf.fit(x_train,y_train)
SGDpred_wkd = SGDclf.predict(x_test)
print("Accuracy: ",metrics.accuracy_score(y_test, SGDpred_wkd))
print('Precision:', metrics.precision_score(y_test, SGDpred_wkd,average='weighted'))
print('Recall:', metrics.recall_score(y_test, SGDpred_wkd,average='weighted'))
print('f1-score:', metrics.f1_score(y_test, SGDpred_wkd,average='weighted'))

print('After distillation..')
SGDclf.fit(x_train_1,y_train_1)
SGDpred = SGDclf.predict(x_test_1)
print("Accuracy: ",metrics.accuracy_score(y_test_1, SGDpred))
print('Precision:', metrics.precision_score(y_test_1, SGDpred,average='weighted'))
print('Recall:', metrics.recall_score(y_test_1, SGDpred,average='weighted'))
print('f1-score:', metrics.f1_score(y_test_1, SGDpred,average='weighted'))

Before distillation..




Accuracy:  0.8109286543127395
Precision: 0.8238116023461399
Recall: 0.8109286543127395
f1-score: 0.8160438660653192
After distillation..




Accuracy:  0.8357185555047646
Precision: 0.8284594481010228
Recall: 0.8357185555047646
f1-score: 0.812303289260643


In [51]:
#Applying KNN algorithm
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 10) # n_neighbors = k
print('Before distillation..')
knn.fit(x_train,y_train)
prediction_wkd = knn.predict(x_test)
print("Accuracy: ",metrics.accuracy_score(y_test, prediction_wkd))
print('Precision:', metrics.precision_score(y_test, prediction_wkd,average='weighted'))
print('Recall:', metrics.recall_score(y_test, prediction_wkd,average='weighted'))
print('f1-score:', metrics.f1_score(y_test, prediction_wkd,average='weighted'))

print('After distillation..')
knn.fit(x_train_1,y_train_1)
prediction = knn.predict(x_test_1)
print("Accuracy:",metrics.accuracy_score(y_test_1, prediction))
print('Precision:', metrics.precision_score(y_test_1, prediction,average='weighted'))
print('Recall:', metrics.recall_score(y_test_1, prediction,average='weighted'))
print('f1-score:', metrics.f1_score(y_test_1, prediction,average='weighted'))

Before distillation..
Accuracy:  0.841063328527726
Precision: 0.8309830760502863
Recall: 0.841063328527726
f1-score: 0.8251324777858265
After distillation..
Accuracy: 0.8412039804493829
Precision: 0.8311511790486314
Recall: 0.8412039804493829
f1-score: 0.8253201110261494
