# **Pattern Recognition and Machine Learning**
> Week 6 Tutorial

## **Tutorial 1 - Text Classification**

## Classification of news articles into different topics using Naive Bayes
### 1. Retrieve data

In [None]:
from sklearn.datasets import fetch_20newsgroups

# subset='all' loads the predefined train and test portions together;
# the data is split again manually in step 5.
news_groups = fetch_20newsgroups(subset='all')

### 2. Explore dataset

In [None]:
news_groups.data[1]

In [None]:
list(news_groups.target_names)

In [None]:
news_groups.filenames.shape

In [None]:
news_groups.target.shape

### 3. Calculate word frequencies
Use CountVectorizer to calculate a bag of words and their frequencies.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One vectorizer instance is shared by the cells below: it is fitted first
# on a toy corpus and later re-fitted on the full news data.
cv = CountVectorizer()

An example first:

In [None]:
# Toy corpus of four short sentences used to illustrate CountVectorizer
# before applying it to the real dataset.
document = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [None]:
# Learn the vocabulary from the toy corpus (the transformed matrix is not
# needed in this cell), then display the token -> column-index mapping.
cv.fit(document)
cv.vocabulary_

In [None]:
cv.fit_transform(document).toarray()

Output a bag of words and their frequency.

In [None]:
# Fit once and reuse the resulting matrix so that the vocabulary and the
# counts are guaranteed to come from the same fit.
counts = cv.fit_transform(document)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Plain Python strings/ints keep the printed dict free of NumPy scalar reprs.
word_list = [str(word) for word in feature_names]

# Summing over axis 0 adds up each word's count across all documents.
count_list = counts.toarray().sum(axis=0)
print(dict(zip(word_list, count_list.tolist())))

### 4. Extract features

In [None]:
# Re-fit the vectorizer on the real corpus: this replaces the toy vocabulary
# and yields one row of word counts per news article.
features = cv.fit_transform(news_groups.data)

In [None]:
features.shape

In [None]:
# Do NOT densify the whole matrix: `features` is a sparse document-term
# matrix over the full corpus, and features.toarray() would have to allocate
# n_documents x n_vocabulary integers, which can exhaust memory.
# Preview a small dense corner instead.
features[:5, :20].toarray()

### 5. Split into train/test sets

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the articles for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, news_groups.target, train_size=0.8, random_state=11
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

### 6. Build multinomial Naive Bayes model and fit

In [None]:
from sklearn.naive_bayes import MultinomialNB

# alpha is the additive smoothing parameter; fit() returns the estimator
# itself, so construction and training can be chained.
nb = MultinomialNB(alpha=1).fit(X_train, y_train)

# Predicted topic index for every held-out article.
y_pred = nb.predict(X_test)

In [None]:
print(y_test)
print(y_pred)

### 7. Evaluate the model

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

In [None]:
# Overall accuracy, followed by the support-weighted multi-class scores.
# (Support-weighted recall is mathematically equal to accuracy.)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
for label, scorer in (
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1-score:", metrics.f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))

In [None]:
# Per-class precision / recall / F1; target_names replaces the integer
# class ids with the newsgroup names in the printed table.
report = classification_report(y_test, y_pred,
                               target_names=news_groups.target_names)
print(report)

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# In scikit-learn's convention rows are the true classes and columns the
# predicted classes, so off-diagonal entries are misclassifications.
confusion_matrix(y_test, y_pred)

### 8. Find the optimal alpha using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

nb = MultinomialNB()

Parameter settings for alpha: 10, 1, 0.1, 0.01, 0.001

In [None]:
# Candidate smoothing strengths, spaced by powers of ten from 10 down to 0.001.
params = {'alpha': [10, 1, 1e-1, 1e-2, 1e-3]}

Run the Grid Search and fit the data

In [None]:
# Alternative: 10-fold cross-validation (more fits per candidate):
#   grs = GridSearchCV(nb, param_grid=params, cv=10)

# With cv left unset, GridSearchCV uses its default number of folds
# (5 in scikit-learn >= 0.22).
grs = GridSearchCV(nb, param_grid=params)

# Cross-validate every alpha on the training set only; the test set stays
# untouched for the final evaluation.
grs.fit(X_train, y_train)

The optimal value

In [None]:
print("Best Hyper Parameters:",grs.best_params_)

### 9. Evaluate the result

In [None]:
# Predict the held-out articles through the fitted grid search object.
y_pred = grs.predict(X_test)

# Shared keyword arguments for the multi-class (support-weighted) scores.
weighted = {'average': 'weighted'}

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, **weighted))
print("Recall:", metrics.recall_score(y_test, y_pred, **weighted))
print("F1-score:", metrics.f1_score(y_test, y_pred, **weighted))

## **Tutorial 2 - Ransomware Detection using K-NN**
### 1. Retrieve data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# The path is relative to the notebook's working directory, so the CSV
# must be present under ./data before this cell is run.
bitcoin_heist = pd.read_csv("data/BitcoinHeistData.csv")

### 2. Explore data

In [None]:
bitcoin_heist.shape

In [None]:
bitcoin_heist.info()

In [None]:
bitcoin_heist.head(3)

In [None]:
bitcoin_heist.describe()

In [None]:
bitcoin_heist.describe(include="O")

In [None]:
bitcoin_heist.dtypes

### 3. Convert categorical values to numerical values

In [None]:
# Binary target: 0 for rows labelled 'white', 1 for every other label.
# The comparison is done column-wise instead of in a Python-level loop.
is_not_white = bitcoin_heist['label'].ne('white')
bitcoin_heist['labels'] = is_not_white.astype('int64')

# Show how many rows fall into each of the two classes.
bitcoin_heist['labels'].value_counts()

### 4. Extract features

In [None]:
# Numeric columns used as model inputs.
feature_columns = ['year', 'day', 'length', 'weight', 'count',
                   'looped', 'neighbors', 'income']

# .loc slices by label and includes the end point, so this window covers
# index labels 0 through 200000.
row_window = slice(0, 200000)

X = bitcoin_heist.loc[row_window, feature_columns]
y = bitcoin_heist.loc[row_window, 'labels']

### 5. Split data into train/test sets

In [None]:
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=11
)

### 6. Build and fit K-NN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline model: each sample is classified from its 3 nearest training
# points. fit() returns the estimator, so it can be chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = knn.predict(X_test)

### 7. Evaluate the model

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
f1_score(y_test, y_pred, average='weighted')

### 8. Tune parameter using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Fresh estimator with default settings to serve as the search template.
knn = KNeighborsClassifier()
# Candidate neighbourhood sizes k = 1..9 (range's upper bound is exclusive).
params = {'n_neighbors': range(1,10)}

# Alternative: 10-fold cross-validation (more fits per candidate):
#   grs = GridSearchCV(knn, param_grid=params, cv=10)

# With cv left unset, GridSearchCV uses its default number of folds
# (5 in scikit-learn >= 0.22).
grs = GridSearchCV(knn, param_grid=params)
# NOTE: GridSearchCV fits clones of `knn`; the `knn` object itself remains
# unfitted. Use `grs` (or grs.best_estimator_) for the tuned model.
grs.fit(X_train, y_train)

In [None]:
print("Best Hyper Parameters:",grs.best_params_)

In [None]:
from sklearn import metrics

# Predict the held-out rows through the fitted grid search object.
y_pred = grs.predict(X_test)

# Accuracy first, then the support-weighted scores, one per line.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
weighted_scorers = {
    "Precision:": metrics.precision_score,
    "Recall:": metrics.recall_score,
    "F1-score:": metrics.f1_score,
}
for label, scorer in weighted_scorers.items():
    print(label, scorer(y_test, y_pred, average='weighted'))

Put more data into training.

In [None]:
# Use every row of the dataset instead of only the rows selected earlier
# with .loc[0:200000].
X = bitcoin_heist[['year', 'day', 'length', 'weight', 'count',
                   'looped', 'neighbors', 'income']]
y = bitcoin_heist['labels']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=11
)

# GridSearchCV fits clones, so the `knn` object that was handed to it is
# still an unfitted classifier with default settings. Build the model from
# the tuned hyper-parameters so the search result is actually used here.
knn = KNeighborsClassifier(**grs.best_params_)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Scores for the model retrained on the full dataset.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = metrics.precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)

recall = metrics.recall_score(y_test, y_pred, average='weighted')
print("Recall:", recall)

f1 = metrics.f1_score(y_test, y_pred, average='weighted')
print("F1-score:", f1)