# **Pattern Recognition and Machine Learning**
> Week 6 Tutorial

## **Tutorial 1 - Text Classification**

## Classification of news articles into different topics using Naive Bayes
### 1. Retrieve data

In [1]:
from sklearn.datasets import fetch_20newsgroups

news_groups = fetch_20newsgroups(subset='all')

### 2. Explore dataset

In [30]:
news_groups.data[1]

'From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)\nSubject: Which high-performance VLB video card?\nSummary: Seek recommendations for VLB video card\nNntp-Posting-Host: midway.ecn.uoknor.edu\nOrganization: Engineering Computer Network, University of Oklahoma, Norman, OK, USA\nKeywords: orchid, stealth, vlb\nLines: 21\n\n  My brother is in the market for a high-performance video card that supports\nVESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:\n\n  - Diamond Stealth Pro Local Bus\n\n  - Orchid Farenheit 1280\n\n  - ATI Graphics Ultra Pro\n\n  - Any other high-performance VLB card\n\n\nPlease post or email.  Thank you!\n\n  - Matt\n\n-- \n    |  Matthew B. Lawson <------------> (mblawson@essex.ecn.uoknor.edu)  |   \n  --+-- "Now I, Nebuchadnezzar, praise and exalt and glorify the King  --+-- \n    |   of heaven, because everything he does is right and all his ways  |   \n    |   are just." - Nebuchadnezzar, king of Babylon, 562 B.C.           |   \n'

In [33]:
list(news_groups.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [35]:
news_groups.filenames.shape

(18846,)

In [36]:
news_groups.target.shape

(18846,)

### 3. Calculate words frequency
Use CountVectorizer to calculate a bag of words and their frequencies.

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

An example first:

In [38]:
document = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [39]:
cv.fit_transform(document)
cv.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [40]:
cv.fit_transform(document).toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

Output a bag of words and their frequency.

In [41]:
word_list = cv.get_feature_names()
count_list = cv.fit_transform(document).toarray().sum(axis=0)
print(dict(zip(word_list,count_list)))

{'and': 1, 'document': 4, 'first': 2, 'is': 4, 'one': 1, 'second': 1, 'the': 4, 'third': 1, 'this': 4}


### 4. Extract features

In [42]:
features = cv.fit_transform(news_groups.data)

In [43]:
features.shape

(18846, 173762)

In [44]:
features.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### 5. Split into train/test sets

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features,
                                                    news_groups.target,
                                                    train_size=0.8,
                                                    random_state=111)

In [46]:
X_train.shape

(15076, 173762)

In [47]:
X_test.shape

(3770, 173762)

In [48]:
y_train.shape

(15076,)

In [49]:
y_test.shape

(3770,)

### 6. Build multinomil Naive Bayes model and fit

In [50]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB(alpha=1)
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [51]:
print(y_test)
print(y_pred)

[17  9  2 ...  7 14 10]
[18  9  5 ...  7 14 10]


### 7. Evaluate the model

In [52]:
from sklearn import metrics
from sklearn.metrics import classification_report

In [53]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall:",metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1-score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.8445623342175066
Precision: 0.8576065380444653
Recall: 0.8445623342175066
F1-score: 0.829158924495538


In [54]:
report = classification_report(y_test, y_pred,
                               target_names=news_groups.target_names)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.87      0.86      0.86       171
           comp.graphics       0.69      0.87      0.77       205
 comp.os.ms-windows.misc       0.92      0.11      0.20       199
comp.sys.ibm.pc.hardware       0.70      0.83      0.76       214
   comp.sys.mac.hardware       0.86      0.84      0.85       179
          comp.windows.x       0.67      0.90      0.77       182
            misc.forsale       0.91      0.73      0.81       167
               rec.autos       0.91      0.88      0.90       185
         rec.motorcycles       0.96      0.93      0.94       209
      rec.sport.baseball       0.97      0.94      0.95       216
        rec.sport.hockey       0.96      0.98      0.97       207
               sci.crypt       0.85      0.97      0.90       211
         sci.electronics       0.86      0.84      0.85       202
                 sci.med       0.92      0.92      0.92       193
         

Confusion matrx

In [55]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[147,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,
          1,   1,  11,   0,   3,   3,   4],
       [  0, 179,   0,   6,   0,   9,   1,   1,   0,   0,   0,   2,   1,
          1,   1,   1,   0,   0,   3,   0],
       [  0,  35,  22,  49,  10,  58,   1,   0,   0,   2,   0,   3,   7,
          0,   3,   1,   1,   1,   6,   0],
       [  1,  10,   0, 178,   4,   5,   4,   0,   0,   0,   0,   6,   2,
          0,   2,   0,   0,   0,   2,   0],
       [  1,   4,   1,   9, 151,   2,   0,   0,   0,   0,   1,   1,   7,
          1,   0,   0,   0,   0,   1,   0],
       [  0,  12,   0,   2,   0, 164,   0,   0,   0,   1,   0,   2,   1,
          0,   0,   0,   0,   0,   0,   0],
       [  0,   1,   0,   9,   7,   0, 122,   8,   2,   0,   2,   2,   4,
          0,   2,   1,   4,   1,   2,   0],
       [  0,   1,   0,   1,   0,   1,   3, 163,   3,   1,   0,   0,   1,
          0,   1,   1,   3,   3,   3,   0],
       [  0,   0,   0,   0,   0,   0,   3,   2, 194,   0,   0,  

### 8. Find the optimal alpha using GridSearchCV

In [56]:
from sklearn.model_selection import GridSearchCV

nb = MultinomialNB()

Parameters setting for alpha: 10, 1, 0.1, 0.001, 0.0001

In [57]:
params = {'alpha': [10, 1, 1e-1, 1e-2, 1e-3]}

Run the Grid Search and fit the data

In [58]:
# 10-fold
#grs = GridSearchCV(nb, param_grid=params, cv = 10)

# 5-fold default
grs = GridSearchCV(nb, param_grid=params)

grs.fit(X_train, y_train)

GridSearchCV(estimator=MultinomialNB(),
             param_grid={'alpha': [10, 1, 0.1, 0.01, 0.001]})

The optimal value

In [59]:
print("Best Hyper Parameters:",grs.best_params_)

Best Hyper Parameters: {'alpha': 0.001}


### 9. Evaluate the result

In [60]:
y_pred=grs.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall:",metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1-score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.8949602122015915
Precision: 0.8993482849616136
Recall: 0.8949602122015915
F1-score: 0.8919135874673622


## **Tutorial 2 - Ransomware Detection using K-NN**
### 1. Retrieve data

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split

bitcoin_heist = pd.read_csv("data/BitcoinHeistData.csv")

### 2. Explore data

In [62]:
bitcoin_heist.shape

(2916697, 10)

In [63]:
bitcoin_heist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


In [64]:
bitcoin_heist.head(3)

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber


In [65]:
bitcoin_heist.describe()

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income
count,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0,2916697.0
mean,2014.475,181.4572,45.00859,0.5455192,721.6446,238.5067,2.206516,4464889000.0
std,2.257398,104.0118,58.98236,3.674255,1689.676,966.3217,17.91877,162686000000.0
min,2011.0,1.0,0.0,3.606469e-94,1.0,0.0,1.0,30000000.0
25%,2013.0,92.0,2.0,0.02148438,1.0,0.0,1.0,74285590.0
50%,2014.0,181.0,8.0,0.25,1.0,0.0,2.0,199998500.0
75%,2016.0,271.0,108.0,0.8819482,56.0,0.0,2.0,994000000.0
max,2018.0,365.0,144.0,1943.749,14497.0,14496.0,12920.0,49964400000000.0


In [66]:
bitcoin_heist.describe(include="O")

Unnamed: 0,address,label
count,2916697,2916697
unique,2631095,29
top,1LXrSb67EaH1LGc6d6kWHq8rgv4ZBQAcpU,white
freq,420,2875284


In [67]:
bitcoin_heist.dtypes

address       object
year           int64
day            int64
length         int64
weight       float64
count          int64
looped         int64
neighbors      int64
income       float64
label         object
dtype: object

### 3. Convert categorical values to numerical values

In [68]:
bitcoin_heist['labels'] = [0 if x == 'white' else 1 for x in bitcoin_heist['label']]

bitcoin_heist['labels'].value_counts()

0    2875284
1      41413
Name: labels, dtype: int64

### 4. Extract features

In [69]:
X = bitcoin_heist.loc[0:200000, ['year','day', 'length', 'weight', 'count',
                                 'looped', 'neighbors', 'income']]
y = bitcoin_heist.loc[0:200000, 'labels']

### 5. Split data into train/test sets

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=111)

### 6. Build and fit K-NN model

In [71]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

### 7. Evaluate the model

In [72]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [73]:
confusion_matrix(y_test, y_pred)

array([[31232,   517],
       [ 1633,  6619]], dtype=int64)

In [74]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     31749
           1       0.93      0.80      0.86      8252

    accuracy                           0.95     40001
   macro avg       0.94      0.89      0.91     40001
weighted avg       0.95      0.95      0.94     40001



In [75]:
f1_score(y_test, y_pred, average='weighted')

0.944766476726059

### 8. Tune parameter using GridSearchCV

In [76]:
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()
params = {'n_neighbors': range(1,10)}

# 10-fold
#grs = GridSearchCV(knn, param_grid=params, cv = 10)

# 5-fold default
grs = GridSearchCV(knn, param_grid=params)
grs.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 10)})

In [81]:
print("Best Hyper Parameters:",grs.best_params_)

Best Hyper Parameters: {'n_neighbors': 3}


In [78]:
from sklearn import metrics

y_pred=grs.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall:",metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1-score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.9462513437164071
Precision: 0.9456163155971592
Recall: 0.9462513437164071
F1-score: 0.944766476726059


Put more data into training.

In [80]:
X = bitcoin_heist[['year','day', 'length', 'weight', 'count',
                  'looped', 'neighbors', 'income']]
y = bitcoin_heist['labels']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=111)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[573809,   1180],
       [  6420,   1931]], dtype=int64)

In [82]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall:",metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1-score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.986971577467686
Precision: 0.9836638158909031
Recall: 0.986971577467686
F1-score: 0.9840230704872608
