# Hyper parameter optimization
Use grid search to find the best hyperparameters for the models, then use the full dataset to train the final models.

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plots_fabi import *
import pickle
import sys
# Add functions path
sys.path.append('../Functions')
from time import time

### Load Dataset

In [2]:
# `datasets` is presumably the project-local module made importable by the sys.path entry above -- confirm.
from datasets import load_stratified_dataset
# Small subsample (samples_per_label=1000) with a fixed seed; used only for the grid searches below.
df = load_stratified_dataset(path='../Datasets/dataset_categories/dataset_categories_train.csv', labels='category', samples_per_label=1000, random_seed=11)

In [3]:
# TfidfVectorizer turns raw text into TF-IDF weighted bag-of-words features
from sklearn.feature_extraction.text import TfidfVectorizer

# sublinear_tf: use the logarithmic form of the term frequency
# min_df: minimum number of documents a term must appear in to be kept
# ngram_range: (1, 3) -> unigrams, bigrams and trigrams
# stop_words: None here, so no stop-word list is applied
# max_features: keep at most 40000 terms in the vocabulary

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                        ngram_range=(1, 3), stop_words=None, max_features=40000)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out part of the subsample for evaluation; fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['text_lem'], df['category'], random_state = 42)

# Fit the vectorizer on the training split only, then apply the fitted transform to the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

***
# GridSearch
Search for best hyperparameters.

In [5]:
from sklearn.model_selection import GridSearchCV

### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Grid: 3 penalties x 10 values of C (log-spaced from 0.1 to 10) x 3 multi_class modes, 5-fold CV.
# NOTE(review): no `solver` and no `l1_ratio` are set; the default solver presumably cannot fit
# 'l1' / 'elasticnet', so those candidates may only produce failed fits and the search would
# effectively cover 'l2' alone -- confirm against the scikit-learn version in use.
parameters = {'penalty': ['l1', 'l2', 'elasticnet'], 'C': np.logspace(-1,1,10), 'multi_class': ['auto', 'ovr', 'multinomial']}
clf = GridSearchCV(LogisticRegression(max_iter = 10000, random_state=42), parameters, n_jobs=4, cv=5)
clf.fit(X_train_tfidf, y_train);
clf.best_params_

{'C': 5.994842503189409, 'multi_class': 'ovr', 'penalty': 'l2'}

In [8]:
# (train score, held-out score) of the grid-search result; the gap between the two shows how much it overfits.
clf.score(X_train_tfidf, y_train), clf.score(X_test_tfidf, y_test)

(0.9765714285714285, 0.7462857142857143)

### LinearSVC

In [9]:
from sklearn.svm import LinearSVC

In [10]:
# Grid: 2 penalties x 10 values of C x 2 dual settings x 2 losses, 5-fold CV.
# NOTE(review): LinearSVC presumably rejects some of these combinations (e.g. penalty='l1'
# together with loss='hinge'); such candidates would fail instead of being scored -- confirm.
parameters = {'penalty': ['l1', 'l2'], 'C': np.logspace(-1,1,10), 'dual': [False, True], 'loss': ['hinge', 'squared_hinge']}
clf = GridSearchCV(LinearSVC(max_iter = 10000, random_state=42), parameters, n_jobs=4, cv=5)
clf.fit(X_train_tfidf, y_train);
clf.best_params_

{'C': 0.46415888336127786,
 'dual': False,
 'loss': 'squared_hinge',
 'penalty': 'l2'}

In [11]:
# (train score, held-out score) of the LinearSVC grid-search result.
clf.score(X_train_tfidf, y_train), clf.score(X_test_tfidf, y_test)

(0.976, 0.7491428571428571)

### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Grid: 4 x 2 x 5 x 4 = 160 candidates, each evaluated with 5-fold CV (800 forest fits) -- this cell is slow.
parameters = {'n_estimators': [100, 400, 700, 1000], 'criterion': ['gini','entropy'], 'max_depth': [None, 10, 30, 50, 80], 'min_samples_split': [2, 6, 10, 15]}
clf = GridSearchCV(RandomForestClassifier(random_state=42), parameters, n_jobs=4, cv=5)
clf.fit(X_train_tfidf, y_train)
clf.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'min_samples_split': 6,
 'n_estimators': 700}

In [14]:
# (train score, held-out score) of the random-forest grid-search result.
clf.score(X_train_tfidf, y_train), clf.score(X_test_tfidf, y_test)

(0.996, 0.7091428571428572)

***
For __all models__, grid search does __not__ really seem to __improve__ accuracy, probably because the remaining errors stem from the given dataset itself. Further grid analysis is therefore not necessary.

***
## Full data optimization

In [15]:
# samples_per_label=99000 is far above the class sizes; the loader's message below reports the
# smallest class (12026 samples). Presumably every label is then cut down to about that size --
# verify in load_stratified_dataset.
df_train = load_stratified_dataset(path='../Datasets/dataset_categories/dataset_categories_train.csv', labels='category', samples_per_label=99000, random_seed=42)
# The evaluation data comes from a separate test file.
df_test = pd.read_csv('../Datasets/dataset_categories/dataset_categories_test.csv')
df_train.shape, df_test.shape

Smallest sample size in dataset is 12026 samples!


((84181, 8), (3500, 8))

In [16]:
from sklearn.model_selection import train_test_split

# Train and test already come from separate files, so no further split is made here.
X_train = df_train.text_lem
X_test = df_test.text_lem
y_train = df_train.category
y_test = df_test.category

# Re-fit the shared `tfidf` object on the new training texts (this replaces the earlier fit).
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#### Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB
# Baseline with default hyperparameters; no grid search was run for this model.
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(nb.score(X_train_tfidf, y_train)*100, nb.score(X_test_tfidf, y_test)*100))

Training accuracy: 74.28%;  Test accuracy: 74.71%


#### Logistic Regression

In [18]:
# Hyperparameters are the (rounded) best_params_ reported by the logistic-regression grid search.
lr = LogisticRegression(max_iter=10000, random_state=42, C=5.99, multi_class= 'ovr', penalty='l2')
lr.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(lr.score(X_train_tfidf, y_train)*100, lr.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.52%;  Test accuracy: 82.80%


#### LinearSVC

In [19]:
# Hyperparameters are the (rounded) best_params_ reported by the LinearSVC grid search.
svc = LinearSVC(max_iter=10000, random_state=42, C=0.46, dual=False, loss='squared_hinge', penalty='l2')
svc.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(svc.score(X_train_tfidf, y_train)*100, svc.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.23%;  Test accuracy: 82.66%


#### Random Forest

In [20]:
# NOTE(review): max_depth=80 and n_estimators=400 differ from the best_params_ reported by the
# random-forest grid search (max_depth=None, n_estimators=700). Presumably chosen to limit
# training time / model size -- confirm this is intentional.
rf = RandomForestClassifier(random_state=42, criterion='gini', max_depth=80, min_samples_split=6, n_estimators= 400)
rf.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(rf.score(X_train_tfidf, y_train)*100, rf.score(X_test_tfidf, y_test)*100))

Training accuracy: 98.41%;  Test accuracy: 77.83%


## Ensemble

In [21]:
from sklearn.ensemble import VotingClassifier
# Hard voting over the four models: each contributes its predicted label.
# NOTE(review): the already-fitted estimators are passed in; voter.fit() presumably trains its
# own copies again rather than reusing these fits -- confirm.
voter = VotingClassifier(estimators=[('nb', nb), ('lr', lr), ('svc', svc), ('rf', rf)], voting='hard')

In [22]:
voter.fit(X_train_tfidf, y_train)

VotingClassifier(estimators=[('nb', MultinomialNB()),
                             ('lr',
                              LogisticRegression(C=5.99, max_iter=10000,
                                                 multi_class='ovr',
                                                 random_state=42)),
                             ('svc',
                              LinearSVC(C=0.46, dual=False, max_iter=10000,
                                        random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=80,
                                                     min_samples_split=6,
                                                     n_estimators=400,
                                                     random_state=42))])

In [23]:
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(voter.score(X_train_tfidf, y_train)*100, voter.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.62%;  Test accuracy: 81.06%


***
## Full data optimization with unbalanced data set

In [24]:
# Read the complete training file directly, so the class sizes stay unbalanced (no per-label subsampling).
df_train = pd.read_csv('../Datasets/dataset_categories/dataset_categories_train.csv')
df_test = pd.read_csv('../Datasets/dataset_categories/dataset_categories_test.csv')
df_train.shape, df_test.shape

((131549, 8), (3500, 8))

In [25]:
# Sample weights for training: each class is weighted by (largest class size / own class size),
# so the largest class gets weight 1 and rarer classes get proportionally more.
class_counts = df_train.category.value_counts()
# value_counts() sorts descending, so the first entry is the largest class. Use .iloc for the
# positional lookup: plain [0] on a label-indexed Series is deprecated in pandas.
cat_weight = 1/class_counts*class_counts.iloc[0]
# One weight per training row, in row order (vectorised lookup instead of a Python loop).
sample_weight = df_train.category.map(cat_weight).tolist()
cat_weight

travel           1.000000
financial        1.000000
sports           1.000000
technology       1.000000
world            1.000000
entertainment    1.024119
politics         1.662980
Name: category, dtype: float64

In [26]:
from sklearn.model_selection import train_test_split

# Train and test come from separate files; no further split is made here.
X_train = df_train.text_lem
y_train = df_train.category
X_test = df_test.text_lem
y_test = df_test.category

# Re-fit the shared `tfidf` object on the unbalanced training texts (this replaces the earlier fit).
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#### Naive Bayes

In [27]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
# The weights are used for fitting only; the two score() calls below are unweighted.
nb.fit(X_train_tfidf, y_train, sample_weight=sample_weight)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(nb.score(X_train_tfidf, y_train)*100, nb.score(X_test_tfidf, y_test)*100))

Training accuracy: 74.74%;  Test accuracy: 74.83%


#### Logistic Regression

In [28]:
# Same hyperparameters as the balanced run; only the training data and the sample weights differ.
lr = LogisticRegression(max_iter=10000, random_state=42, C=5.99, multi_class= 'ovr', penalty='l2')
# The weights are used for fitting only; the two score() calls below are unweighted.
lr.fit(X_train_tfidf, y_train, sample_weight=sample_weight)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(lr.score(X_train_tfidf, y_train)*100, lr.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.04%;  Test accuracy: 83.20%


#### LinearSVC

In [29]:
# Same hyperparameters as the balanced run; only the training data and the sample weights differ.
svc = LinearSVC(max_iter=10000, random_state=42, C=0.46, dual=False, loss='squared_hinge', penalty='l2')
# The weights are used for fitting only; the two score() calls below are unweighted.
svc.fit(X_train_tfidf, y_train, sample_weight = sample_weight)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(svc.score(X_train_tfidf, y_train)*100, svc.score(X_test_tfidf, y_test)*100))

Training accuracy: 91.72%;  Test accuracy: 83.43%


#### Random Forest

In [30]:
# Same hyperparameters as the balanced run; only the training data and the sample weights differ.
rf = RandomForestClassifier(random_state=42, criterion='gini', max_depth=80, min_samples_split=6, n_estimators= 400)
# The weights are used for fitting only; the two score() calls below are unweighted.
rf.fit(X_train_tfidf, y_train, sample_weight = sample_weight)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(rf.score(X_train_tfidf, y_train)*100, rf.score(X_test_tfidf, y_test)*100))

Training accuracy: 97.95%;  Test accuracy: 77.83%


***
## Save final model
Save the final model for use in Streamlit. Save the tfidf vectorizer and the svc model as pkl files, and write a class that loads both and uses them to categorize new text.

### Use a smaller model, otherwise the files are too big for Streamlit

In [37]:
# TfidfVectorizer turns raw text into TF-IDF weighted bag-of-words features
from sklearn.feature_extraction.text import TfidfVectorizer

# sublinear_tf: use the logarithmic form of the term frequency
# min_df: minimum number of documents a term must appear in to be kept
# ngram_range: (1, 3) -> unigrams, bigrams and trigrams
# stop_words is not passed here, so the vectorizer's default applies

# Fresh, unfitted vectorizer for the model that gets saved (max_features caps the vocabulary at 40000 terms).
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                        ngram_range=(1, 3), max_features=40000)

In [38]:
# samples_per_label=100000 is above the class sizes (the loader reports the smallest class below);
# the resulting shape matches the earlier balanced load that used samples_per_label=99000.
df_train = load_stratified_dataset(path='../Datasets/dataset_categories/dataset_categories_train.csv', labels='category', samples_per_label=100000, random_seed=42)
df_test = pd.read_csv('../Datasets/dataset_categories/dataset_categories_test.csv')
df_train.shape, df_test.shape

Smallest sample size in dataset is 12026 samples!


((84181, 8), (3500, 8))

In [39]:
from sklearn.model_selection import train_test_split

# Train and test come from separate files; no further split is made here.
X_train = df_train.text_lem
X_test = df_test.text_lem
y_train = df_train.category
y_test = df_test.category

# Fit the new vectorizer on the training texts; this fitted object is the one pickled below.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [40]:
# Final model to be saved: LinearSVC with the same hyperparameters as in the balanced run above.
svc = LinearSVC(max_iter=10000, random_state=42, C=0.46, dual=False, loss='squared_hinge', penalty='l2')
svc.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(svc.score(X_train_tfidf, y_train)*100, svc.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.23%;  Test accuracy: 82.66%


In [41]:
len(tfidf.vocabulary_)

40000

In [42]:
# Drop the fitted `stop_words_` attribute before pickling to keep the file small.
# Presumably it is only informational and not needed by transform() -- confirm for the
# scikit-learn version in use. The constructor parameter `stop_words` is left untouched.
# Guarded so the cell can be re-run, and so it still works if the attribute is absent.
if hasattr(tfidf, 'stop_words_'):
    delattr(tfidf, 'stop_words_')

In [43]:
# Persist the fitted vectorizer. The context manager closes (and flushes) the file, so the
# size check that follows sees the complete file.
# NOTE(review): an earlier comment here said "378M", while `du` below reports 1.9M; presumably
# 378M was the size before `stop_words_` was removed -- confirm.
with open("tfidf.pkl", "wb") as fh:
    pickle.dump(tfidf, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
!du -sh tfidf.pkl

1.9M	tfidf.pkl


In [45]:
# Persist the fitted classifier. The context manager closes (and flushes) the file, so the
# size check that follows sees the complete file.
with open("svc.pkl", "wb") as fh:
    pickle.dump(svc, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [46]:
!du -sh svc.pkl

2.1M	svc.pkl


In [102]:
def get_imp_feat(vectorizer, clf, label, X, n_top=5):
    """Return the highest-weighted features of `label` that occur in `X`.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing `get_feature_names_out()` or the
        legacy `get_feature_names()`.
    clf : fitted linear classifier exposing `classes_` and a `coef_` with one
        row per class.
    label : class label whose coefficients are inspected.
    X : the text to look the features up in. The check is `word in X`, so for
        a plain string this is a substring match (e.g. 'ad' matches 'read').
    n_top : maximum number of features to return (default 5).

    Returns
    -------
    list of str
        Up to `n_top` feature names, ordered from the largest coefficient down.

    Raises
    ------
    IndexError
        If `label` is not among `clf.classes_`.
    """
    # Newer scikit-learn releases only provide get_feature_names_out();
    # fall back to the legacy accessor for older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Row of coef_ that belongs to `label`. Use the classifier that was passed
    # in, not a notebook-global model, so the function works for any `clf`.
    idx = np.where(clf.classes_ == label)[0][0]

    # Feature indices sorted by coefficient, largest first
    sort_by_importance = np.argsort(clf.coef_[idx])[::-1]

    # Collect the first `n_top` features that are present in X
    most_imp = []
    for arg in sort_by_importance:
        if len(most_imp) >= n_top:
            break
        word = str(feature_names[arg])
        if word in X:
            most_imp.append(word)

    return most_imp

In [107]:
# X_test[0] is a label-based lookup on the Series (index label 0) -- presumably the first test document; confirm.
get_imp_feat(tfidf, svc, 'world', X_test[0])

['say', 'pti', 'world', 'relate', 'ad']

In [47]:
class Categorizer():
    """Load the pickled TF-IDF vectorizer and classifier and categorize text.

    No lemmatization is applied here (that step is disabled), while the model
    in this notebook was trained on the `text_lem` column. Callers should
    therefore pass text that is already lemmatized the same way.
    """

    def __init__(self, tfidf_path='tfidf.pkl', svc_path='svc.pkl'):
        """Load both models from disk.

        Parameters
        ----------
        tfidf_path : path of the pickled, fitted vectorizer.
        svc_path : path of the pickled, fitted classifier.
        """
        # SECURITY: unpickling executes arbitrary code -- only load files you created yourself.
        # Context managers make sure the file handles are closed again.
        with open(tfidf_path, 'rb') as fh:
            self.tfidf = pickle.load(fh)
        with open(svc_path, 'rb') as fh:
            self.svc = pickle.load(fh)

    def preprocess(self, X):
        """Vectorize `X` (a single string or an iterable of strings)."""
        # A single string is wrapped in a list so it is treated as one document;
        # isinstance also covers str subclasses.
        if isinstance(X, str):
            X = [X]

        # Tfidf vectorization
        return self.tfidf.transform(X)

    def pred(self, X):
        """Return the predicted category for each document in `X`."""
        X_tfidf = self.preprocess(X)
        return self.svc.predict(X_tfidf)

In [48]:
categorizer = Categorizer()

In [49]:
pred = categorizer.pred(X_test)

In [50]:
# Fraction of test rows where the reloaded model's prediction equals the true label;
# it should match the test accuracy printed when `svc` was trained above.
(pred == y_test).sum()/y_test.shape[0]

0.8265714285714286

Final accuracy is correct, loading and using model works fine.