## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): star import puts every public name of plots_fabi into the notebook
# namespace, so the origin of plotting helpers is not visible at the call site.
# It also runs BEFORE '../Functions' is appended to sys.path below, so plots_fabi
# presumably resolves from the notebook directory or an installed package - confirm.
from plots_fabi import *
import sys
# Make modules stored in ../Functions importable for the cells that follow
# (presumably where the project-local `datasets` helper lives - TODO confirm)
sys.path.append('../Functions')
from time import time

## Load Dataset

In [2]:
from datasets import load_stratified_dataset

# Small sample for the hyper-parameter searches: 1000 rows requested per
# 'category' label, drawn with a fixed seed.
df = load_stratified_dataset(
    path='../Datasets/dataset_categories/dataset_big.csv',
    labels='category',
    samples_per_label=1000,
    random_seed=11,
)

In [3]:
# TfidfVectorizer turns raw documents into a sparse matrix of TF-IDF weighted n-gram features
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings used below:
#   sublinear_tf  - apply logarithmic scaling to the term frequency
#   min_df        - minimum number of documents a term must appear in to be kept
#   norm          - 'l2' rescales every document vector to unit Euclidean length
#   encoding      - only consulted when the input is bytes/files rather than str
#   ngram_range   - (min_n, max_n) sizes of the n-grams extracted: unigrams and bigrams
#   stop_words    - drop the built-in list of very common English words

tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

In [4]:
from sklearn.model_selection import train_test_split

# Hold-out split of the lemmatised text column and its labels; no test_size is
# passed, so scikit-learn's default proportions apply. Seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df['text_lem'],
    df['category'],
    random_state=42,
)

# The vectorizer is fitted on the training texts only; the test texts are merely
# transformed with the fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### GridSearch

In [5]:
from sklearn.model_selection import GridSearchCV

#### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Hyper-parameter search for logistic regression (5-fold CV on the training TF-IDF matrix).
#
# The estimator keeps scikit-learn's default solver ('lbfgs'), which only accepts
# the 'l2' penalty. The previous grid also listed 'l1' and 'elasticnet': every fit
# for those candidates raised, GridSearchCV recorded them as NaN and emitted
# FitFailedWarning, so two thirds of the grid was never actually evaluated.
# Only the fittable combinations are listed here, which leaves the set of
# successfully evaluated candidates unchanged.
# NOTE(review): to really explore 'l1' / 'elasticnet', add a second grid with
# solver='saga' (plus 'l1_ratio' for elasticnet) - expect a much longer search.
parameters = {
    'penalty': ['l2'],
    'C': np.logspace(-1, 1, 10),  # 10 log-spaced values from 0.1 to 10
    'multi_class': ['auto', 'ovr', 'multinomial'],
}
clf = GridSearchCV(
    LogisticRegression(max_iter=10000, random_state=42),
    parameters,
    n_jobs=4,
    cv=5,
)
clf.fit(X_train_tfidf, y_train)
# Last expression of the cell: show the winning parameter combination
clf.best_params_

{'C': 5.994842503189409, 'multi_class': 'ovr', 'penalty': 'l2'}

In [8]:
# (train score, test score) of the search's refitted best estimator
(
    clf.score(X_train_tfidf, y_train),
    clf.score(X_test_tfidf, y_test),
)

(0.975047619047619, 0.7331428571428571)

#### LinearSVC

In [6]:
from sklearn.svm import LinearSVC

In [10]:
# Hyper-parameter search for LinearSVC (5-fold CV on the training TF-IDF matrix).
#
# LinearSVC does not support every penalty / loss / dual combination:
#   - penalty='l1' works only with loss='squared_hinge' and dual=False
#   - loss='hinge' works only with penalty='l2' and dual=True
# The previous full Cartesian grid therefore contained candidates whose fits
# always raised (recorded as NaN with FitFailedWarning). A list of grids is used
# instead so that only the four valid combinations are evaluated per value of C.
parameters = [
    # L2 penalty with squared hinge loss: primal and dual formulations both valid
    {
        'penalty': ['l2'],
        'loss': ['squared_hinge'],
        'dual': [False, True],
        'C': np.logspace(-1, 1, 10),
    },
    # L2 penalty with plain hinge loss: dual formulation only
    {
        'penalty': ['l2'],
        'loss': ['hinge'],
        'dual': [True],
        'C': np.logspace(-1, 1, 10),
    },
    # L1 penalty: squared hinge loss in the primal formulation only
    {
        'penalty': ['l1'],
        'loss': ['squared_hinge'],
        'dual': [False],
        'C': np.logspace(-1, 1, 10),
    },
]
clf = GridSearchCV(
    LinearSVC(max_iter=10000, random_state=42),
    parameters,
    n_jobs=4,
    cv=5,
)
clf.fit(X_train_tfidf, y_train)
# Last expression of the cell: show the winning parameter combination
clf.best_params_

{'C': 0.46415888336127786,
 'dual': False,
 'loss': 'squared_hinge',
 'penalty': 'l2'}

In [11]:
# (train score, test score) of the search's refitted best estimator
(
    clf.score(X_train_tfidf, y_train),
    clf.score(X_test_tfidf, y_test),
)

(0.9737142857142858, 0.7325714285714285)

#### Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Random-forest search space: 4 x 2 x 5 x 4 = 160 candidates, each evaluated
# with 5-fold cross-validation.
parameters = dict(
    n_estimators=[100, 400, 700, 1000],
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 30, 50, 80],
    min_samples_split=[2, 6, 10, 15],
)
clf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    parameters,
    n_jobs=4,
    cv=5,
)
clf.fit(X_train_tfidf, y_train)
# Last expression of the cell: show the winning parameter combination
clf.best_params_

{'criterion': 'gini',
 'max_depth': 80,
 'min_samples_split': 6,
 'n_estimators': 400}

In [31]:
# (train score, test score) of the search's refitted best estimator
(
    clf.score(X_train_tfidf, y_train),
    clf.score(X_test_tfidf, y_test),
)

(0.9950476190476191, 0.7017142857142857)

### Full data optimization

In [8]:
# Reload the data for the final models. 99000 samples per label are requested;
# the loader's message below the cell reports the smallest label size instead -
# presumably the request is capped to that size (verify in load_stratified_dataset).
# Note: this rebinds `df`, replacing the 1000-per-label sample used for the searches.
df = load_stratified_dataset(
    path='../Datasets/dataset_categories/dataset_big.csv',
    labels='category',
    samples_per_label=99000,
    random_seed=42,
)

Smallest sample size in dataset is 12525 samples!


In [9]:
from sklearn.model_selection import train_test_split

# Same seeded hold-out split as before, now applied to the larger dataset.
# These assignments rebind the split and matrix names used by the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    df['text_lem'],
    df['category'],
    random_state=42,
)

# Re-fit the vectorizer on the new training texts, then transform the test
# texts with that fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#### Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

# Report accuracy on both splits as percentages
print(
    'Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(
        nb.score(X_train_tfidf, y_train) * 100,
        nb.score(X_test_tfidf, y_test) * 100,
    )
)

Training accuracy: 78.03%;  Test accuracy: 73.83%


#### Logistic Regression

In [17]:
# Logistic regression configured with the rounded winners of the grid search
lr = LogisticRegression(
    max_iter=10000,
    random_state=42,
    C=5.99,
    multi_class='ovr',
    penalty='l2',
)
lr.fit(X_train_tfidf, y_train)

# Report accuracy on both splits as percentages
print(
    'Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(
        lr.score(X_train_tfidf, y_train) * 100,
        lr.score(X_test_tfidf, y_test) * 100,
    )
)

Training accuracy: 96.08%;  Test accuracy: 80.85%


#### LinearSVC

In [14]:
# Linear SVM configured with the rounded winners of the grid search
svc = LinearSVC(
    max_iter=10000,
    random_state=42,
    C=0.46,
    dual=False,
    loss='squared_hinge',
    penalty='l2',
)
svc.fit(X_train_tfidf, y_train)

# Report accuracy on both splits as percentages
print(
    'Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(
        svc.score(X_train_tfidf, y_train) * 100,
        svc.score(X_test_tfidf, y_test) * 100,
    )
)

Training accuracy: 96.21%;  Test accuracy: 81.37%


#### Random Forest

In [15]:
# Random forest configured with the winners of the grid search
rf = RandomForestClassifier(
    random_state=42,
    criterion='gini',
    max_depth=80,
    min_samples_split=6,
    n_estimators=400,
)
rf.fit(X_train_tfidf, y_train)

# Report accuracy on both splits as percentages
print(
    'Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(
        rf.score(X_train_tfidf, y_train) * 100,
        rf.score(X_test_tfidf, y_test) * 100,
    )
)

Training accuracy: 97.96%;  Test accuracy: 74.58%


***
# Ensemble

In [18]:
from sklearn.ensemble import VotingClassifier

# Ensemble of the four classifiers above; 'hard' voting combines their
# predicted class labels by majority rule.
voter = VotingClassifier(
    estimators=[
        ('nb', nb),
        ('lr', lr),
        ('svc', svc),
        ('rf', rf),
    ],
    voting='hard',
)

In [20]:
# Train the ensemble on the TF-IDF training matrix; the fitted estimator's
# repr is the cell output.
voter.fit(
    X_train_tfidf,
    y_train,
)

VotingClassifier(estimators=[('nb', MultinomialNB()),
                             ('lr',
                              LogisticRegression(C=5.99, max_iter=10000,
                                                 multi_class='ovr',
                                                 random_state=42)),
                             ('svc',
                              LinearSVC(C=0.46, dual=False, max_iter=10000,
                                        random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=80,
                                                     min_samples_split=6,
                                                     n_estimators=400,
                                                     random_state=42))])

In [21]:
# Report the ensemble's accuracy on both splits as percentages
print(
    'Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(
        voter.score(X_train_tfidf, y_train) * 100,
        voter.score(X_test_tfidf, y_test) * 100,
    )
)

Training accuracy: 96.01%;  Test accuracy: 79.05%
