# Task 1 Text Classification
## Dataset: Fetch 20 Newsgroups (same as in class work)​

## Algorithms: Multinomial Naïve Bayes, Logistic Regression, Support Vector Machines, Decision Trees​

## Feature Extractors: CountVectorizer, Word2Vec, Doc2Vec and so on

### Import all the necessary libraries

In [1]:
from pprint import pprint
import pandas as pd
from time import time
import logging
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.impute import SimpleImputer

### Choose a few categories fro the entire 20 categories

In [2]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [3]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetch documents for these 2 categories

In [4]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories



In [5]:
# load dataset
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories = categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories = categories)
newsgroups_train.data
newsgroups_train.target

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [6]:
X_train,X_test,y_train,y_test = train_test_split(newsgroups_train.data, newsgroups_train.target, test_size=0.3, random_state = 42)

### Define a pipeline combining a text feature extractor with a simple classifier
## Logistic Regression with CountVectorizer

In [7]:
pipeline_LR = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

In [8]:
pipeline_LR.fit(X_train,y_train)
y_pred=pipeline_LR.predict(X_test)
acc_lr_cv = accuracy_score(y_test, y_pred)
print("Accuracy_Score", acc_lr_cv)

Accuracy_Score 0.7170542635658915


## Multinomial Naïve Bayes with CountVectorizer

In [9]:
pipeline_Multi = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',  MultinomialNB()),
])

In [10]:
pipeline_Multi.fit(X_train,y_train)
y_pred=pipeline_Multi.predict(X_test)
acc_Multi_cv = accuracy_score(y_test, y_pred)
print("Accuracy_Score", acc_Multi_cv)

Accuracy_Score 0.5581395348837209


##  Support Vector Machines with CountVectorizer

In [11]:
pipeline_SVC = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',  SVC()),
])

In [12]:
pipeline_SVC.fit(X_train,y_train)
y_pred=pipeline_SVC.predict(X_test)
acc_SVC_cv = accuracy_score(y_test, y_pred)
print("Accuracy_Score", acc_SVC_cv)

Accuracy_Score 0.7248062015503876


##  Decision Trees with CountVectorizer

In [13]:
pipeline_Decision = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', DecisionTreeClassifier()),
])

In [14]:
pipeline_Decision.fit(X_train,y_train)
y_pred=pipeline_Decision.predict(X_test)
acc_Decision_cv = accuracy_score(y_test, y_pred)
print("Accuracy_Score", acc_Decision_cv)

Accuracy_Score 0.6046511627906976


# Word2Vec

In [15]:
model = KeyedVectors.load_word2vec_format('C:/Users/chail/googlenews/GoogleNews-vectors-negative300.bin.gz',binary=True)
print('done loading word2vec')

done loading word2vec


In [16]:
def embedding_feats(list_of_lists):
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    feats = []
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSION)
        count_for_this = 0
        for token in tokens :
            if token in model:
                feat_for_this += model[token]
                count_for_this += 1
        feats.append(feat_for_this/count_for_this)
    return np.array(feats, dtype=float)



In [17]:
X_train_feats = embedding_feats(X_train)
X_test_feats = embedding_feats(X_test)
print(len(X_train_feats))
print(len(X_test_feats))
#X_train_feats
#X_test_feats

  feats.append(feat_for_this/count_for_this)


599
258


In [18]:
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_feats)
X_train_feats_scale = imputer.transform(X_train_feats)
X_test_feats_scale = imputer.transform(X_test_feats)

In [19]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_feats_scale_minmax = scaler.fit_transform(X_train_feats_scale)
X_test_feats_scale_minmax = scaler.transform(X_test_feats_scale)


## MultinomialNB with Word2Vec

In [20]:
clf = MultinomialNB()
clf.fit(X_train_feats_scale_minmax, y_train)
y_pred = clf.predict(X_test_feats_scale_minmax)
acc_mnb_w2v = accuracy_score(y_test, y_pred)
print("Accuracy_Score_w2v_mnb", acc_mnb_w2v)

Accuracy_Score_w2v_mnb 0.5310077519379846


## Logistic Regression with Word2Vec

In [21]:
lr = LogisticRegression()
lr.fit(X_train_feats_scale, y_train)
y_pred = lr.predict(X_test_feats_scale)
acc_lr_w2v = accuracy_score(y_test, y_pred)
print("Accuracy_Score_w2v_lr", acc_lr_w2v)

Accuracy_Score_w2v_lr 0.5581395348837209


## SVM with Word2Vec

In [22]:
#count vectoizer and svm
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train_feats_scale, y_train)
y_pred = svm_model.predict(X_test_feats_scale)
acc_svm_w2v = accuracy_score(y_test, y_pred)
print("Accuracy_Score_w2v_svm", acc_svm_w2v)

Accuracy_Score_w2v_svm 0.5465116279069767


## Decision Tree with Word2Vec

In [23]:
dt = DecisionTreeClassifier()
dt.fit(X_train_feats_scale, y_train)
y_pred = dt.predict(X_test_feats_scale)
acc_dt_w2v = accuracy_score(y_test, y_pred)
print("Accuracy_Score_w2v", acc_dt_w2v)

Accuracy_Score_w2v 0.5348837209302325


# Table for best combination:

In [24]:
data_w2v = {
    'Model': ['MultinomialNB', 'LogisticRegression', 'SVM', 'DecisionTree'],
    'Accuracy Score using Word2Vec': [acc_mnb_w2v, acc_lr_w2v, acc_svm_w2v, acc_dt_w2v]
}
data_cv = {'Model': ['MultinomialNB', 'LogisticRegression', 'SVM', 'DecisionTree'],
    'Accuracy Score using Count Vectorizer': [acc_Multi_cv, acc_lr_cv, acc_SVC_cv, acc_Decision_cv]
}

df = pd.concat([pd.DataFrame(data_w2v), pd.DataFrame(data_cv)], axis=1)

df

Unnamed: 0,Model,Accuracy Score using Word2Vec,Model.1,Accuracy Score using Count Vectorizer
0,MultinomialNB,0.531008,MultinomialNB,0.55814
1,LogisticRegression,0.55814,LogisticRegression,0.717054
2,SVM,0.546512,SVM,0.724806
3,DecisionTree,0.534884,DecisionTree,0.604651
