# Project 1

## Question 1

Getting familiar with the dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import random
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from tempfile import mkdtemp
from joblib import Memory

# Hide library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Seed both global RNGs (NumPy and the stdlib) so repeated runs are reproducible.
for seed_fn in (np.random.seed, random.seed):
    seed_fn(42)

data = pd.read_csv('./Project1-Classification.csv')
n_rows, n_cols = data.shape
print('number of rows:', n_rows)
print('number of columns:', n_cols)
data.head(3)

Plot graphs

In [None]:
# Length of every document, counting alphanumeric characters only
# (whitespace and punctuation do not contribute).
text_count = [
    sum(ch.isalnum() for ch in document)
    for document in data['full_text']
]

plt.hist(text_count, bins=50)
plt.xlabel('Number of characters in text', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.xlim((0, 15000))  # documents longer than this fall outside the visible range
plt.show()

In [None]:
plt.figure(figsize=(14, 6))

# Number of samples per leaf class, in order of first appearance in the data.
leaf_counts = Counter(data['leaf_label'])
x, y = list(leaf_counts), list(leaf_counts.values())
plt.bar(x, y)

# Write the sample count just above each bar.
for class_name, n_samples in leaf_counts.items():
    plt.text(class_name, n_samples, n_samples, ha='center', va='bottom')

plt.xlabel('Classes of leaf_label', fontsize=14)
plt.ylabel('Number of samples', fontsize=14)
plt.show()

In [None]:
# Number of samples per root class, in order of first appearance in the data.
root_counts = Counter(data['root_label'])
x, y = list(root_counts), list(root_counts.values())
plt.bar(x, y)

# Write the sample count just above each bar.
for class_name, n_samples in root_counts.items():
    plt.text(class_name, n_samples, n_samples, ha='center', va='bottom')

plt.xlabel('Classes of root_label', fontsize=14)
plt.ylabel('Number of samples', fontsize=14)
plt.show()

## Question 2

Splitting the entire dataset into training and testing data

In [None]:
# Split into train/test: 20% of the rows are held out, and the fixed
# random_state makes the split reproducible.
text_and_root_label = data[["full_text", "root_label"]]
train, test = train_test_split(text_and_root_label, test_size=0.2, random_state=42)

for split_name, split_frame in (('train', train), ('test', test)):
    print(split_name + ' sample number: ', len(split_frame))

Clean data

In [None]:
import re
def clean(text):
    """Strip markup and normalise a raw document string.

    Steps, in order:
      * drop lines that begin with an http(s) URL;
      * turn ``<br />`` into a space and decode ``&quot;``, ``&#39;`` and ``&amp;``
        (the latter becomes the word ``and``);
      * turn carriage returns / newlines into spaces;
      * expand the stand-alone token ``u`` to ``you`` and remove backticks;
      * squeeze runs of ``!`` or ``?`` down to a single character;
      * drop non-ASCII characters and any remaining ``<...>`` tags;
      * collapse runs of spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Only lines that *start* with a URL are removed (MULTILINE anchors ^ per line).
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"", texter)
    # &#39; is the HTML entity for an apostrophe, not a double quote.
    texter = re.sub('&#39;', "'", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub(r'[\r\n]', ' ', texter)
    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(r'<.*?>', '', texter)
    # Collapse spaces last: the removals above can leave several adjacent spaces.
    texter = re.sub(' +', ' ', texter)
    return texter

# Fetch the NLTK data used later in this notebook by word_tokenize, pos_tag
# and WordNetLemmatizer.
# NOTE(review): "all" downloads every NLTK package; the tokenizer, tagger and
# WordNet packages alone would presumably suffice — confirm against the installed NLTK version.
nltk.download("all")

In [None]:
# clean data
# DataFrame.applymap returns a new frame rather than modifying in place, so the
# result must be assigned back; otherwise the cleaning is silently discarded.
train = train.applymap(clean)
test = test.applymap(clean)

## Question 3

Lemmatizing and produce TF-IDF matrix

In [None]:
def transform_tag(tag):
    """Map a POS tag to the WordNet part-of-speech code used by the lemmatizer.

    Matching is done on the first two characters of the tag, so inflected
    variants share the mapping of their base tag (e.g. 'VBD' and 'VBG' map like
    'VB', 'NNS' like 'NN', 'JJR' like 'JJ', 'RBS' like 'RB').

    Parameters
    ----------
    tag : str
        POS tag as produced by ``pos_tag``.

    Returns
    -------
    str
        'n' (noun), 'a' (adjective), 'v' (verb) or 'r' (adverb); any tag that is
        not recognised falls back to 'n'.
    """
    tag_dict = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    return tag_dict.get(tag[:2], 'n')

def lemmatizer(text):
    """Tokenize and POS-tag `text`, then return the lower-cased lemma of every token.

    Each token's POS tag is converted with ``transform_tag`` before being handed
    to WordNetLemmatizer. Punctuation and digit tokens are kept; callers filter them.
    """
    wnl = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(text))
    return [
        wnl.lemmatize(word=token.lower(), pos=transform_tag(pos))
        for token, pos in tagged_tokens
    ]


# lemmatize text
# Each document is lemmatized; tokens that are punctuation (i.e. substrings of
# string.punctuation) or consist solely of digits are dropped.
# The column is rebuilt as a whole instead of writing through `train.iloc[i].at[...]`:
# that form assigns into a temporary row object and is silently lost whenever
# pandas returns a copy rather than a view.
train = train.assign(full_text=[
    ' '.join(
        token for token in lemmatizer(document)
        if token not in string.punctuation and not token.isdigit()
    )
    for document in train['full_text']
])

tfidf = TfidfTransformer()

# Vocabulary: English stop words removed, terms must occur in at least 3 documents.
vectorizer = CountVectorizer(min_df=3, stop_words='english')
X_train_counts = vectorizer.fit_transform(train['full_text'])

X_train_tfidf = tfidf.fit_transform(X_train_counts) # making the tfidf train matrix

print(X_train_tfidf.shape)

In [None]:
# Lemmatize the test documents exactly like the training documents: drop tokens
# that are punctuation (substrings of string.punctuation) or consist solely of digits.
# The column is rebuilt as a whole instead of writing through `test.iloc[i].at[...]`:
# that form assigns into a temporary row object and is silently lost whenever
# pandas returns a copy rather than a view.
test = test.assign(full_text=[
    ' '.join(
        token for token in lemmatizer(document)
        if token not in string.punctuation and not token.isdigit()
    )
    for document in test['full_text']
])

# Re-use the vocabulary and IDF weights fitted on the training set (transform only).
X_test_counts = vectorizer.transform(test['full_text'])
X_test_tfidf = tfidf.transform(X_test_counts)
X_test_tfidf.shape

## Question 4

LSI & NMF

In [None]:
#LSI - explained_variance_ratio plot

# Number of LSI components to try.
k = [1, 10, 50, 100, 200, 500, 1000, 2000]
explained_variance_ratio = []
train_list_LSI = []
test_list_LSI = []
for n_components in k:
    LSI_model = TruncatedSVD(n_components=n_components, random_state=42)
    X_train_LSI = LSI_model.fit_transform(X_train_tfidf)
    X_test_LSI = LSI_model.transform(X_test_tfidf)
    # The reduced matrices are kept for every k so they can be reused later.
    train_list_LSI.append(X_train_LSI)
    test_list_LSI.append(X_test_LSI)
    # Total fraction of variance retained by the first n_components components.
    explained_variance_ratio.append(LSI_model.explained_variance_ratio_.sum())

plt.plot(k, explained_variance_ratio) # explained_variance_ratio plot with different k
plt.xlabel('Number of LSI components k', fontsize=14)
plt.ylabel('Explained variance ratio', fontsize=14)
plt.show()

In [None]:
# NMF error

nmf = NMF(n_components=50, init='random', random_state=42)
# W for the training documents; the test documents are projected with the fitted model.
train_matrix_NMF = nmf.fit_transform(X_train_tfidf)
test_matrix_NMF = nmf.transform(X_test_tfidf)
H = nmf.components_

# Sum of squared entries of the reconstruction residual X - W.H.
nmf_residual = np.array(X_train_tfidf - train_matrix_NMF @ H)
print("Error for NMF: ", np.sum(nmf_residual**2))

In [None]:
# LSI error

LSI_model = TruncatedSVD(n_components=50, random_state=42)
# Reduced training matrix; the test documents are projected with the fitted model.
train_matrix_LSI = LSI_model.fit_transform(X_train_tfidf)
test_matrix_LSI = LSI_model.transform(X_test_tfidf)

# Sum of squared entries of the residual between X and its rank-50 reconstruction.
lsi_residual = np.array(X_train_tfidf - train_matrix_LSI.dot(LSI_model.components_))
print("Error for LSI: ", np.sum(lsi_residual**2))

In [None]:
import seaborn as sns

def plot_roc(fpr, tpr):
    """Draw a ROC curve on a new figure, reporting the area under it in the legend.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, e.g. as returned by ``roc_curve``.
    """
    _, axes = plt.subplots()
    area = auc(fpr, tpr)

    axes.plot(fpr, tpr, lw=2, label='area under curve = %0.4f' % area)
    axes.grid(color='0.7', linestyle='--', linewidth=1)

    # Small margins around the unit square so the curve does not touch the frame.
    axes.set_xlim([-0.1, 1.1])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate', fontsize=15)
    axes.set_ylabel('True Positive Rate', fontsize=15)

    axes.legend(loc="lower right")

    for tick_label in axes.get_xticklabels() + axes.get_yticklabels():
        tick_label.set_fontsize(15)

def plot_confusion_matrix(matrix, title):
    """Show `matrix` as an annotated heat map on a new figure.

    Parameters
    ----------
    matrix : array-like
        Confusion matrix (rows are true labels, columns are predictions).
    title : str
        Text appended to 'confusion matrix of ' in the figure title.
    """
    _, axes = plt.subplots()
    # Draw the heat map; '.20g' prints the cell counts without scientific notation.
    sns.heatmap(matrix, annot=True, ax=axes, fmt='.20g')

    axes.set_title('confusion matrix of ' + title, fontsize=13)  # title
    axes.set_xlabel('predict', fontsize=13)  # x-axis
    axes.set_ylabel('true', fontsize=13)  # y-axis


## Question 5

Linear SVM: soft margin V.S. hard margin

In [None]:
# hard-margin SVM, soft-margin SVM and harder-margin SVM performance contrast
# A larger C penalises margin violations more heavily (harder margin).

hard_LSVC = LinearSVC(C=1000,random_state=42)
soft_LSVC = LinearSVC(C=0.0001,random_state=42)
harder_LSVC = LinearSVC(C=100000,random_state=42)

# Binary targets: 1 for 'sports', 0 for every other root label.
train_label = [1 if i == 'sports' else 0 for i in train['root_label']]
test_label = [1 if i == 'sports' else 0 for i in test['root_label']]

hard_LSVC.fit(train_matrix_LSI,train_label)
soft_LSVC.fit(train_matrix_LSI,train_label)
harder_LSVC.fit(train_matrix_LSI,train_label)

hard_pred = hard_LSVC.predict(test_matrix_LSI)
soft_pred = soft_LSVC.predict(test_matrix_LSI)
# Predict with the C=100000 model itself (not with hard_LSVC), so the "harder"
# metrics below describe the right classifier.
harder_pred = harder_LSVC.predict(test_matrix_LSI)

hard_confusion_matrix = confusion_matrix(test_label,hard_pred)
soft_confusion_matrix = confusion_matrix(test_label,soft_pred)
harder_confusion_matrix = confusion_matrix(test_label,harder_pred)

print('hard_confusion_matrix:\n',hard_confusion_matrix)
print('soft_confusion_matrix:\n',soft_confusion_matrix)
print('harder_confusion_matrix:\n',harder_confusion_matrix)
print('\n')

hard_acc = accuracy_score(test_label,hard_pred)
soft_acc = accuracy_score(test_label,soft_pred)
harder_acc = accuracy_score(test_label,harder_pred)

print('hard acc: ',hard_acc)
print('soft acc: ',soft_acc)
print('harder acc: ',harder_acc)
print('\n')

hard_recall = recall_score(test_label,hard_pred)
soft_recall = recall_score(test_label,soft_pred)
harder_recall = recall_score(test_label,harder_pred)

print('hard recall: ',hard_recall)
print('soft recall: ',soft_recall)
print('harder recall: ',harder_recall)
print('\n')

hard_precision = precision_score(test_label,hard_pred)
soft_precision = precision_score(test_label,soft_pred)
harder_precision = precision_score(test_label,harder_pred)

print('hard precision: ',hard_precision)
print('soft precision: ',soft_precision)
print('harder precision: ',harder_precision)
print('\n')

hard_f1 = f1_score(test_label,hard_pred)
soft_f1 = f1_score(test_label,soft_pred)
harder_f1 = f1_score(test_label,harder_pred)

print('hard f1: ',hard_f1)
print('soft f1: ',soft_f1)
print('harder f1: ',harder_f1)
print('\n')

# ROC curves are built from the signed distance to the separating hyperplane.
fpr_hard, tpr_hard, _ = roc_curve(test_label,hard_LSVC.decision_function(test_matrix_LSI))
fpr_soft, tpr_soft, _ = roc_curve(test_label,soft_LSVC.decision_function(test_matrix_LSI))
fpr_harder, tpr_harder, _ = roc_curve(test_label,harder_LSVC.decision_function(test_matrix_LSI))

plot_roc(fpr_hard, tpr_hard)
plot_roc(fpr_soft, tpr_soft)
plot_roc(fpr_harder, tpr_harder)

In [None]:
# One heat map per SVM, titled with the regularisation value it was trained with.
for model_pred, caption in (
    (hard_pred, 'SVM with gamma = 1000'),
    (soft_pred, 'SVM with gamma = 0.0001'),
    (harder_pred, 'SVM with gamma = 100000'),
):
    plot_confusion_matrix(confusion_matrix(test_label, model_pred), caption)

Cross-validation to find best gamma for SVM

In [None]:
# cross-validation for choosing best C(gamma) for LSVC model using gridsearch

svc_C = LinearSVC(random_state=42)
hyperparameter = {'C':[0.001,0.01,0.1,1,10,1e2,1e3,1e4,1e5,1e6]}
# 5-fold cross-validated search, selecting on accuracy.
svc_C_clf = GridSearchCV(svc_C, hyperparameter, cv=5, scoring='accuracy')
svc_C_clf.fit(train_matrix_LSI, train_label)

best_svc = svc_C_clf.best_estimator_
print('best hyperparameter gamma: ', best_svc.C)
best_pred = best_svc.predict(test_matrix_LSI)

best_cm = confusion_matrix(test_label, best_pred)
print('best confusion matrix: \n', best_cm)
for metric_name, metric_fn in (
    ('acc', accuracy_score),
    ('recall', recall_score),
    ('precision', precision_score),
    ('f1', f1_score),
):
    print('best ' + metric_name + ': ', metric_fn(test_label, best_pred))

fpr_best, tpr_best, _ = roc_curve(test_label, best_svc.decision_function(test_matrix_LSI))
plot_roc(fpr_best, tpr_best)
plot_confusion_matrix(best_cm, 'best SVM')

## Question 6

Logistic regression without regularization

In [None]:
# logistic regression without penalty for classification
# NOTE(review): the string 'none' for `penalty` is presumably rejected by recent
# scikit-learn releases (which expect None) — confirm against the installed version.
clf_lr = LogisticRegression(penalty='none', random_state=42)
clf_lr.fit(train_matrix_LSI, train_label)

lr_pred = clf_lr.predict(test_matrix_LSI)

lr_cm = confusion_matrix(test_label, lr_pred)
print('lr confusion matrix: \n', lr_cm)
for metric_name, metric_fn in (
    ('acc', accuracy_score),
    ('recall', recall_score),
    ('precision', precision_score),
    ('f1', f1_score),
):
    print('lr ' + metric_name + ': ', metric_fn(test_label, lr_pred))

fpr_lr, tpr_lr, _ = roc_curve(test_label, clf_lr.decision_function(test_matrix_LSI))
plot_roc(fpr_lr, tpr_lr)
plot_confusion_matrix(lr_cm, 'LogisticRegression w/o regularization')

L1 and L2 logistic regression and find their best hyperparameters 

In [None]:
# Find best hyperparameter for logistic regression with l1 and l2 penalty respectively
# NOTE(review): both searches are fitted on the NMF features, whereas the
# unregularised model above uses the LSI features — confirm this is intended
# before comparing the three classifiers.
lr_l1 = LogisticRegression(solver='liblinear',penalty='l1',random_state=42)
hyperparameter = {'C':[1e-5,1e-4,1e-3,1e-2,1e-1,1,1e1,1e2,1e3,1e4,1e5]}
lr_l1_clf = GridSearchCV(lr_l1,hyperparameter,cv=5,scoring='accuracy')
lr_l1_clf.fit(train_matrix_NMF,train_label)

print('best regularization strength for l1: ',lr_l1_clf.best_estimator_.C)
print(accuracy_score(test_label,lr_l1_clf.best_estimator_.predict(test_matrix_NMF)))

lr_l2 = LogisticRegression(solver='liblinear',penalty='l2',random_state=42)
hyperparameter = {'C':[1e-5,1e-4,1e-3,1e-2,1e-1,1,1e1,1e2,1e3,1e4,1e5]}
lr_l2_clf = GridSearchCV(lr_l2,hyperparameter,cv=5,scoring='accuracy')
lr_l2_clf.fit(train_matrix_NMF,train_label)

print('best regularization strength for l2: ',lr_l2_clf.best_estimator_.C)
# Report the test accuracy of the L2 model here (not the L1 model again).
print(accuracy_score(test_label,lr_l2_clf.best_estimator_.predict(test_matrix_NMF)))


Compare 3 logistic regression classifiers

In [None]:
pred_l1 = lr_l1_clf.best_estimator_.predict(test_matrix_NMF)
pred_l2 = lr_l2_clf.best_estimator_.predict(test_matrix_NMF)

# Test-set predictions of the three classifiers, in the order they are reported.
lr_predictions = (
    ('lr_no_penalty', lr_pred),
    ('lr_l1', pred_l1),
    ('lr_l2', pred_l2),
)

# Report metric by metric, so the three models can be read side by side.
for metric_name, metric_fn in (
    ('acc', accuracy_score),
    ('recall', recall_score),
    ('precision', precision_score),
    ('f1', f1_score),
):
    for model_name, model_pred in lr_predictions:
        print(model_name + ' ' + metric_name + ': ', metric_fn(test_label, model_pred))

## Question 7

Bayes classifier

In [None]:
# bayes for classification
gnb = GaussianNB()
gnb.fit(train_matrix_LSI,train_label)
pred_gnb = gnb.predict(test_matrix_LSI)

print('gnb confusion matrix: \n',confusion_matrix(test_label,pred_gnb))
print('gnb acc: ',accuracy_score(test_label,pred_gnb))
print('gnb recall: ',recall_score(test_label,pred_gnb))
print('gnb precision: ',precision_score(test_label,pred_gnb))
print('gnb f1: ',f1_score(test_label,pred_gnb))

# Column 1 of predict_proba is the probability of label 1 (the 'sports' class).
fpr_gnb, tpr_gnb, _ = roc_curve(test_label,gnb.predict_proba(test_matrix_LSI)[:,1])
# Plot the Naive Bayes curve computed above (not the logistic-regression one).
plot_roc(fpr_gnb, tpr_gnb)
plot_confusion_matrix(confusion_matrix(test_label,pred_gnb),'Naive Bayes Model')


## Question 8

Grid search for best model

In [None]:
# pipeline and gridsearch to find best hyperparameters combination 
def use_lemma(text):
    """Analyzer for TfidfVectorizer: return the lower-cased lemmas of `text`.

    This is the same tokenize / POS-tag / lemmatize procedure as ``lemmatizer``,
    so it delegates to that function instead of duplicating its body.
    """
    return lemmatizer(text)

def use_stem(text):
    """Analyzer for TfidfVectorizer: return the lower-cased Porter stem of every token.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        One stem per token produced by ``word_tokenize``.
    """
    ps = nltk.stem.PorterStemmer()
    # The list must be returned: an analyzer that yields None makes every
    # stemming configuration of the grid search fail.
    return [ps.stem(word.lower()) for word in word_tokenize(text)]

# Cache fitted transformers on disk so the pipeline can reuse them across grid points.
cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=10)

# The two None steps are placeholders; the grid below supplies the actual estimators.
pipe = Pipeline([
    ('Feature_Extraction',TfidfVectorizer(stop_words='english')),
    ('Dimensionality_Reduction',None),
    ('Classifier',None)],memory=memory)

param_grid = {
    'Feature_Extraction__min_df': (3,5),
    'Feature_Extraction__analyzer':(use_lemma,use_stem),
    'Dimensionality_Reduction':(TruncatedSVD(n_components=5,random_state=42), TruncatedSVD(n_components=30,random_state=42), 
                TruncatedSVD(n_components=80,random_state=42), NMF(n_components=5,init='random', random_state=42),
                NMF(n_components=30,init='random', random_state=42), NMF(n_components=80,init='random', random_state=42)),
    # Classifiers use the regularisation strengths selected by the earlier grid searches.
    'Classifier':(LinearSVC(C=svc_C_clf.best_estimator_.C,random_state=42), LogisticRegression(solver='liblinear',penalty='l2',C=lr_l2_clf.best_estimator_.C,random_state=42),
                LogisticRegression(solver='liblinear',penalty='l1',C=lr_l1_clf.best_estimator_.C,random_state=42), GaussianNB())
    }

grid = GridSearchCV(pipe,cv=5,param_grid=param_grid,scoring='accuracy')

# Start again from the raw data: the analyzers in the grid do their own tokenisation.
data = pd.read_csv('./Project1-Classification.csv')
train, test = train_test_split(data[["full_text","root_label"]], test_size=0.2, random_state=42)
# applymap returns a new frame, so the cleaned result has to be assigned back.
train = train.applymap(clean)
test = test.applymap(clean)

grid.fit(train['full_text'], train['root_label'])
print("Best score for pipeline: ", grid.best_score_)
print("Best params for pipeline: ", grid.best_params_)
print("Best estimator for pipeline: ", grid.best_estimator_)

Select top 5 models

In [None]:
import math
# Grid points that failed have a NaN mean score; count them as 0 so they rank last.
mean_scores = grid.cv_results_['mean_test_score']
li = np.where(np.isnan(mean_scores), 0, mean_scores)

# Indices of the five highest scores, reported best first.
arg = li.argsort()[-5:]
for candidate in arg[::-1]:
    print(grid.cv_results_['params'][candidate])
    print(grid.cv_results_['mean_test_score'][candidate])
    print('\n')


Top 2-5 models performances on test data

In [None]:
data = pd.read_csv('./Project1-Classification.csv')
train, test = train_test_split(data[["full_text","root_label"]], test_size=0.2, random_state=42)
# applymap returns a new frame, so the cleaned result has to be assigned back.
train = train.applymap(clean)
test = test.applymap(clean)

# Lemmatized copies of both splits. The text column is rebuilt as a whole rather
# than written through `frame.iloc[i].at[...]`, which assigns into a temporary
# row object and can be silently lost.
train_c = train.assign(full_text=[' '.join(use_lemma(document)) for document in train['full_text']])
test_c = test.assign(full_text=[' '.join(use_lemma(document)) for document in test['full_text']])

# Hand-built configuration: TF-IDF (min_df=3) -> 80-component LSI -> L2 logistic regression.
tfidf = TfidfVectorizer(stop_words='english',min_df=3)
train_X =  tfidf.fit_transform(train_c['full_text'])
test_X = tfidf.transform(test_c['full_text'])

svd = TruncatedSVD(n_components=80, random_state=42)
train_X = svd.fit_transform(train_X)
test_X = svd.transform(test_X)

train_Y = train_c['root_label']
test_Y = test_c['root_label']

# NOTE: despite its name, `lsvc` holds a LogisticRegression model.
lsvc = LogisticRegression(C=100000.0, penalty='l2', random_state=42, solver='liblinear')
lsvc.fit(train_X,train_Y)
pr = lsvc.predict(test_X)

print('acc: ',accuracy_score(test_Y,pr))
print('recall: ',recall_score(test_Y,pr, pos_label='sports'))
print('precision: ',precision_score(test_Y,pr,pos_label='sports'))
print('f1: ',f1_score(test_Y,pr,pos_label='sports'))



Best model performance

In [None]:
best_model = grid.best_estimator_
best_pre = best_model.predict(test['full_text'])

print('best confusion matrix: \n',confusion_matrix(test['root_label'],best_pre))
print('best acc: ',accuracy_score(test['root_label'],best_pre))
print('best recall: ',recall_score(test['root_label'],best_pre, pos_label='sports'))
print('best precision: ',precision_score(test['root_label'],best_pre,pos_label='sports'))
print('best f1: ',f1_score(test['root_label'],best_pre,pos_label='sports'))

# The grid contains LinearSVC, which has no predict_proba; fall back to the
# decision function in that case.
# NOTE(review): column 1 / a positive decision value is assumed to correspond to
# 'sports' (second class in sorted label order) — confirm via best_model.classes_.
if hasattr(best_model, 'predict_proba'):
    best_scores = best_model.predict_proba(test['full_text'])[:,1]
else:
    best_scores = best_model.decision_function(test['full_text'])

fpr_best, tpr_best, _ = roc_curve(test['root_label'],best_scores,pos_label='sports')
# Plot the curve of the best pipeline (not the earlier logistic-regression curve).
plot_roc(fpr_best, tpr_best)

## Question 9

Bayes, OneVsOne SVM and OneVsRest SVM

In [None]:
# bayes classifier, one_vs_one classifier and one_vs_rest classifier performances compare
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

data = pd.read_csv('./Project1-Classification.csv')
train, test = train_test_split(data[["full_text","leaf_label"]], test_size=0.2, random_state=42)
# applymap returns a new frame, so the cleaned result has to be assigned back.
train = train.applymap(clean)
test = test.applymap(clean)

# Lemmatize both splits, dropping tokens that are punctuation (substrings of
# string.punctuation) or consist solely of digits. The column is rebuilt as a
# whole rather than written through `frame.iloc[i].at[...]`, which assigns into
# a temporary row object and can be silently lost.
train = train.assign(full_text=[
    ' '.join(
        token for token in lemmatizer(document)
        if token not in string.punctuation and not token.isdigit()
    )
    for document in train['full_text']
])
test = test.assign(full_text=[
    ' '.join(
        token for token in lemmatizer(document)
        if token not in string.punctuation and not token.isdigit()
    )
    for document in test['full_text']
])

tfidf = TfidfTransformer()
vectorizer = CountVectorizer(min_df=3, stop_words='english')

# Vocabulary and IDF weights are fitted on the training split only.
X_train_counts = vectorizer.fit_transform(train['full_text'])
X_train_tfidf = tfidf.fit_transform(X_train_counts)
X_test_counts = vectorizer.transform(test['full_text'])
X_test_tfidf = tfidf.transform(X_test_counts)

LSI_model = TruncatedSVD(n_components=50, random_state=42)
train_matrix_LSI = LSI_model.fit_transform(X_train_tfidf)
test_matrix_LSI = LSI_model.transform(X_test_tfidf)


In [None]:
train_label, test_label = train['leaf_label'], test['leaf_label']

# Row index -> leaf class name; fixes the class ordering used for the confusion matrices.
map_row_to_class = dict(enumerate([
    "chess", "cricket", "hockey", "soccer",
    "football", r"%22forest%20fire%22", "flood", "earthquake",
    "drought",
]))

In [None]:
# Independent copies of both splits, so later relabelling leaves `train` / `test` untouched.
copy_train, copy_test = train.copy(), test.copy()

In [None]:
# Class names in row order; passed as `labels=` to the metric functions below.
labels_order = [*map_row_to_class.values()]
labels_order

In [None]:
gnb = GaussianNB()
gnb.fit(train_matrix_LSI, train_label)
gnb_pre = gnb.predict(test_matrix_LSI)

print('gnb confusion matrix: \n', confusion_matrix(test_label, gnb_pre, labels=labels_order))
print('gnb acc: ', accuracy_score(test_label, gnb_pre))
# Multiclass recall / precision / F1 are averaged over classes, weighted by class support.
for metric_name, metric_fn in (
    ('recall', recall_score),
    ('precision', precision_score),
    ('f1', f1_score),
):
    print('gnb ' + metric_name + ': ',
          metric_fn(test_label, gnb_pre, average='weighted', labels=labels_order))

In [None]:
# One binary LinearSVC per pair of classes.
one_vs_one = OneVsOneClassifier(LinearSVC(C=100, random_state=42))
one_vs_one.fit(train_matrix_LSI, train_label)
one_pre = one_vs_one.predict(test_matrix_LSI)

print('one_vs_one confusion matrix: \n', confusion_matrix(test_label, one_pre, labels=labels_order))
print('one_vs_one acc: ', accuracy_score(test_label, one_pre))
# Multiclass recall / precision / F1 are averaged over classes, weighted by class support.
for metric_name, metric_fn in (
    ('recall', recall_score),
    ('precision', precision_score),
    ('f1', f1_score),
):
    print('one_vs_one ' + metric_name + ': ',
          metric_fn(test_label, one_pre, average='weighted', labels=labels_order))

In [None]:
# One binary LinearSVC per class, each trained against all remaining classes.
one_vs_rest = OneVsRestClassifier(LinearSVC(C=100, random_state=42))
one_vs_rest.fit(train_matrix_LSI, train_label)
rest_pre = one_vs_rest.predict(test_matrix_LSI)

print('one_vs_rest confusion matrix: \n', confusion_matrix(test_label, rest_pre, labels=labels_order))
print('one_vs_rest acc: ', accuracy_score(test_label, rest_pre))
# Multiclass recall / precision / F1 are averaged over classes, weighted by class support.
for metric_name, metric_fn in (
    ('recall', recall_score),
    ('precision', precision_score),
    ('f1', f1_score),
):
    print('one_vs_rest ' + metric_name + ': ',
          metric_fn(test_label, rest_pre, average='weighted', labels=labels_order))

Merging classes

In [None]:
# merging classes
# Relabel every 'soccer' sample as 'football' in both copies. The column is
# replaced as a whole rather than written through `frame.iloc[i].at[...]`, which
# assigns into a temporary row object and can be silently lost.
copy_train['leaf_label'] = copy_train['leaf_label'].replace('soccer', 'football')
copy_test['leaf_label'] = copy_test['leaf_label'].replace('soccer', 'football')

merged_train_label = copy_train['leaf_label']
merged_test_label = copy_test['leaf_label']

# Row index -> class name after the merge (eight classes; 'soccer' is gone).
merged_map_to_class = {0:"chess", 1:"cricket", 2:"hockey", 3:"football",
4:r"%22forest%20fire%22", 5:"flood", 6:"earthquake",
7:"drought"}

merged_labels_order = list(merged_map_to_class.values())

one_vs_one = OneVsOneClassifier(LinearSVC(C=100,random_state=42))
one_vs_one.fit(train_matrix_LSI,merged_train_label)
one_pre = one_vs_one.predict(test_matrix_LSI)

print('one_vs_one confusion matrix: \n',confusion_matrix(merged_test_label,one_pre,labels=merged_labels_order))
print('one_vs_one acc: ',accuracy_score(merged_test_label,one_pre))
print('one_vs_one recall: ',recall_score(merged_test_label,one_pre,average='weighted',labels=merged_labels_order))
print('one_vs_one precision: ',precision_score(merged_test_label,one_pre,average='weighted',labels=merged_labels_order))
print('one_vs_one f1: ',f1_score(merged_test_label,one_pre,average='weighted',labels=merged_labels_order))

one_vs_rest = OneVsRestClassifier(LinearSVC(C=100,random_state=42))
one_vs_rest.fit(train_matrix_LSI,merged_train_label)
rest_pre = one_vs_rest.predict(test_matrix_LSI)

print('one_vs_rest confusion matrix: \n',confusion_matrix(merged_test_label,rest_pre,labels=merged_labels_order))
print('one_vs_rest acc: ',accuracy_score(merged_test_label,rest_pre))
print('one_vs_rest recall: ',recall_score(merged_test_label,rest_pre,average='weighted',labels=merged_labels_order))
print('one_vs_rest precision: ',precision_score(merged_test_label,rest_pre,average='weighted',labels=merged_labels_order))
print('one_vs_rest f1: ',f1_score(merged_test_label,rest_pre,average='weighted',labels=merged_labels_order))

Use balanced class_weight

In [None]:
# resolve class imbalance
from sklearn.utils import class_weight

# Metrics that need class averaging; each is averaged weighted by class support.
weighted_metrics = (
    ('recall', recall_score),
    ('precision', precision_score),
    ('f1', f1_score),
)

# class_weight='balanced' re-weights the classes to counter the imbalance created by the merge.
one_vs_one = OneVsOneClassifier(LinearSVC(C=100, random_state=42, class_weight='balanced'))
one_vs_one.fit(train_matrix_LSI, merged_train_label)
one_pre = one_vs_one.predict(test_matrix_LSI)

print('one_vs_one confusion matrix: \n',
      confusion_matrix(merged_test_label, one_pre, labels=merged_labels_order))
print('one_vs_one acc: ', accuracy_score(merged_test_label, one_pre))
for metric_name, metric_fn in weighted_metrics:
    print('one_vs_one ' + metric_name + ': ',
          metric_fn(merged_test_label, one_pre, average='weighted', labels=merged_labels_order))

one_vs_rest = OneVsRestClassifier(LinearSVC(C=100, random_state=42, class_weight='balanced'))
one_vs_rest.fit(train_matrix_LSI, merged_train_label)
rest_pre = one_vs_rest.predict(test_matrix_LSI)

print('one_vs_rest confusion matrix: \n',
      confusion_matrix(merged_test_label, rest_pre, labels=merged_labels_order))
print('one_vs_rest acc: ', accuracy_score(merged_test_label, rest_pre))
for metric_name, metric_fn in weighted_metrics:
    print('one_vs_rest ' + metric_name + ': ',
          metric_fn(merged_test_label, rest_pre, average='weighted', labels=merged_labels_order))

## Question 11


GLoVE + SVM pipeline

In [None]:
# glove
import ast

train, test = train_test_split(data[["keywords","root_label"]], test_size=0.2, random_state=42)
# NOTE: no text cleaning is applied here. The former `applymap(clean)` calls
# discarded their result and therefore never had any effect, and `keywords`
# holds list literals that are parsed with ast.literal_eval below, which
# regex-based text cleaning could corrupt.


def _mean_glove_vectors(frame, embeddings, dim):
    """Return one unit-normalised mean GLoVE vector per row of `frame`.

    Parameters
    ----------
    frame : DataFrame
        Must have a 'keywords' column whose entries are string representations
        of lists of keywords.
    embeddings : dict
        Maps a UTF-8 encoded word (bytes) to its embedding vector.
    dim : int
        Embedding dimension.

    Returns
    -------
    list of ndarray
        For each row, the mean of the embeddings of the keywords found in
        `embeddings`, scaled to unit L2 norm. Rows without any known keyword
        get the zero vector.
    """
    vectors = []
    for keywords_str in frame['keywords']:
        vect = np.zeros(dim)
        matched = 0
        for keyword in ast.literal_eval(keywords_str):
            key = bytes(keyword, 'utf-8')
            if key in embeddings:
                vect += embeddings[key]
                matched += 1
        # Guard against rows in which no keyword is in the vocabulary
        # (dividing by zero matches would fail).
        if matched:
            vect /= matched
        norm = np.linalg.norm(vect)
        if norm != 0:
            vect = vect / norm
        vectors.append(vect)
    return vectors


# Per-dimension training features / labels, kept for the later cells.
X_train = []
Y_train = []

res_acc = []
dimension_of_glove = [50,100,200,300]
for dim in dimension_of_glove:
    # The file is read in binary mode, so the dictionary keys are bytes.
    embeddings_dict = {}
    with open("./glove.6B."+ str(dim) + "d.txt", 'rb') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector

    X = _mean_glove_vectors(train, embeddings_dict, dim)
    X_train.append(X)

    Y = train['root_label']
    Y_train.append(Y)

    X_test = _mean_glove_vectors(test, embeddings_dict, dim)
    Y_test = test['root_label']


    svm = LinearSVC(random_state=42)
    svm.fit(X,Y)
    pred = svm.predict(X_test)
    print(str(dim)+" dimension confusion matrix: \n", confusion_matrix(Y_test,pred))
    print(str(dim)+" dimension acc: ",accuracy_score(Y_test,pred))
    print(str(dim)+" dimension recall: ",recall_score(Y_test,pred,pos_label='sports'))
    print(str(dim)+" dimension precision: ",precision_score(Y_test,pred,pos_label='sports'))
    print(str(dim)+" dimension F-1 score: ",f1_score(Y_test,pred,pos_label='sports'))
    # Plots are drawn for the 300-dimensional embedding only.
    if dim == 300:
        plot_confusion_matrix(confusion_matrix(Y_test,pred),title=str(dim)+' dimension GLoVE')
        fpr_glove, tpr_glove, _ = roc_curve(Y_test,svm.decision_function(X_test),pos_label='sports')
        plot_roc(fpr_glove, tpr_glove)    
    res_acc.append(accuracy_score(Y_test,pred))

## Question 12

In [None]:
# Test accuracy of the linear SVM as a function of the embedding dimension.
_, ax = plt.subplots()
ax.plot(dimension_of_glove, res_acc)
ax.set_xlabel('Dimension of GLoVE embedding', fontsize=14)
ax.set_ylabel('Accuracy of SVM classifier', fontsize=14)
plt.show()

## Question 13

In [None]:
from umap import UMAP

# 2-D UMAP projection of the training features stored at index 3 of X_train.
reducer = UMAP(random_state=42)
embedding = reducer.fit_transform(X_train[3])

# Binary colour code: 0 for 'sports', 1 for every other root label.
color = [0 if i == "sports" else 1 for i in Y_train[3]] 

plt.scatter(embedding[:, 0], embedding[:, 1], c=color, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
# Only two colour codes are used, so the colorbar gets two bins (not ten).
plt.colorbar(boundaries=np.arange(3)-0.5).set_ticks(np.arange(2))
plt.title('UMAP projection of the dataset')
plt.show()

In [None]:
# Baseline: uniformly random vectors with the same shape as X_train[3],
# coloured with the same binary `color` codes as the real data.
X_random = np.random.rand(len(X_train[3]), len(X_train[3][0]))
reducer = UMAP(random_state=42)
embedding = reducer.fit_transform(X_random)

plt.scatter(embedding[:, 0], embedding[:, 1], c=color, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
# Only two colour codes are used, so the colorbar gets two bins (not ten).
plt.colorbar(boundaries=np.arange(3)-0.5).set_ticks(np.arange(2))
plt.title('UMAP projection of random')
plt.show()