# Hyper parameter optimization
Use Grid search to find best parameters for models. Use full dataset to train final models.

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plots_fabi import *
import pickle
import sys
# Add functions path
sys.path.append('../../Functions')
from time import time

### Load Dataset

In [2]:
from datasets import load_stratified_dataset
df = load_stratified_dataset(path='../../Datasets/dataset_categories/dataset_big.csv', labels='category', samples_per_label=1000, random_seed=11)

In [3]:
# TfidfVectorizer generates bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

# sublinear_tf: use logarithmic form for frequency
# min_df: minimum numbers of documents a word must be present to keep it
# ngram_range: number of ngrams to use
# stopwords: remove all common pronouns in given language

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                        ngram_range=(1, 3), stop_words=None, max_features=40000)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text_lem'], df['category'], random_state = 42)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

***
# GridSearch
Search for best hyperparameters.

In [5]:
from sklearn.model_selection import GridSearchCV

### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
parameters = {'penalty': ['l1', 'l2', 'elasticnet'], 'C': np.logspace(-1,1,10), 'multi_class': ['auto', 'ovr', 'multinomial']}
clf = GridSearchCV(LogisticRegression(max_iter = 10000, random_state=42), parameters, n_jobs=4, cv=5)
clf.fit(X_train_tfidf, y_train);
clf.best_params_

{'C': 10.0, 'multi_class': 'ovr', 'penalty': 'l2'}

In [8]:
clf.score(X_train_tfidf, y_train), clf.score(X_test_tfidf, y_test)

(0.989904761904762, 0.7342857142857143)

### LinearSVC

In [9]:
from sklearn.svm import LinearSVC

In [10]:
parameters = {'penalty': ['l1', 'l2'], 'C': np.logspace(-1,1,10), 'dual': [False, True], 'loss': ['hinge', 'squared_hinge']}
clf = GridSearchCV(LinearSVC(max_iter = 10000, random_state=42), parameters, n_jobs=4, cv=5)
clf.fit(X_train_tfidf, y_train);
clf.best_params_

{'C': 0.774263682681127,
 'dual': False,
 'loss': 'squared_hinge',
 'penalty': 'l2'}

In [11]:
clf.score(X_train_tfidf, y_train), clf.score(X_test_tfidf, y_test)

(0.9891428571428571, 0.7308571428571429)

### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

***
For __all models__ grid search does __not__ really seem to __improve__ accuracy. Probably since errors occur due to the dataset given. Further grid analysis is therefor not necessary.

***
## Full data optimization

In [14]:
df_train = load_stratified_dataset(path='../../Datasets/dataset_categories/dataset_categories_train.csv', labels='category', samples_per_label=99000, random_seed=42)
df_test = pd.read_csv('../../Datasets/dataset_categories/dataset_categories_test.csv')
df_train.shape, df_test.shape

Smallest sample size in dataset is 12026 samples!


((84181, 8), (3500, 8))

In [15]:
from sklearn.model_selection import train_test_split

X_train = df_train.text_lem
X_test = df_test.text_lem
y_train = df_train.category
y_test = df_test.category

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#### Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(nb.score(X_train_tfidf, y_train)*100, nb.score(X_test_tfidf, y_test)*100))

Training accuracy: 74.37%;  Test accuracy: 74.94%


#### Logistic Regression

In [17]:
lr = LogisticRegression(max_iter=10000, random_state=42, C=5.99, multi_class= 'ovr', penalty='l2')
lr.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(lr.score(X_train_tfidf, y_train)*100, lr.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.54%;  Test accuracy: 82.94%


#### LinearSVC

In [18]:
svc = LinearSVC(max_iter=10000, random_state=42, C=0.46, dual=False, loss='squared_hinge', penalty='l2')
svc.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(svc.score(X_train_tfidf, y_train)*100, svc.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.29%;  Test accuracy: 83.06%


#### Random Forest

## Ensemble

***
## Full data optimization with unbalanced data set

In [19]:
df_train = pd.read_csv('../../Datasets/dataset_categories/dataset_categories_train.csv')
df_test = pd.read_csv('../../Datasets/dataset_categories/dataset_categories_test.csv')
df_train.shape, df_test.shape

((131549, 8), (3500, 8))

In [20]:
# Sample weights for training
cat_weight = 1/df_train.category.value_counts()*df_train.category.value_counts()[0]
sample_weight = [cat_weight[i] for i in df_train.category]
cat_weight

travel           1.000000
world            1.000000
sports           1.000000
financial        1.000000
technology       1.000000
entertainment    1.024119
politics         1.662980
Name: category, dtype: float64

In [21]:
from sklearn.model_selection import train_test_split

X_train = df_train.text_lem
y_train = df_train.category
X_test = df_test.text_lem
y_test = df_test.category

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#### Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train, sample_weight=sample_weight)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(nb.score(X_train_tfidf, y_train)*100, nb.score(X_test_tfidf, y_test)*100))

Training accuracy: 74.83%;  Test accuracy: 74.83%


#### Logistic Regression

In [23]:
lr = LogisticRegression(max_iter=10000, random_state=42, C=5.99, multi_class= 'ovr', penalty='l2')
lr.fit(X_train_tfidf, y_train, sample_weight=sample_weight)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(lr.score(X_train_tfidf, y_train)*100, lr.score(X_test_tfidf, y_test)*100))

Training accuracy: 92.04%;  Test accuracy: 83.03%


#### LinearSVC

In [24]:
svc = LinearSVC(max_iter=10000, random_state=42, C=0.46, dual=False, loss='squared_hinge', penalty='l2')
svc.fit(X_train_tfidf, y_train, sample_weight = sample_weight)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(svc.score(X_train_tfidf, y_train)*100, svc.score(X_test_tfidf, y_test)*100))

Training accuracy: 91.72%;  Test accuracy: 83.43%


#### Random Forest

In [None]:
def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))

In [None]:
def get_imp_feat(vectorizer, clf, X):
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))

In [27]:
svc.coef_.shape

(7, 40000)

In [44]:
len(set([word for word in X_test[0].split(' ') if word in tfidf.vocabulary_]))

204

In [45]:
len(set(X_test[0].split(' ')))

219

In [46]:
tfidf.get_feature_names()

['00',
 '00 00',
 '00 car',
 '00 car sale',
 '00 cet',
 '00 company',
 '00 company mention',
 '00 eastern',
 '00 edt',
 '00 et',
 '00 local',
 '00 local real',
 '00 pm',
 '00 pm car',
 '00 pm edt',
 '00 pm seanna',
 '00 seanna',
 '00 seanna cronin',
 '000',
 '000 crore',
 '000 square',
 '000 square foot',
 '00am',
 '00p',
 '00pm',
 '01',
 '01 00',
 '01 15',
 '01 ed',
 '01 ed symkusmore',
 '01 pm',
 '01 update',
 '01am',
 '01am october',
 '02',
 '02 15',
 '02 pm',
 '03',
 '03 pm',
 '04',
 '04 pm',
 '05',
 '05 15',
 '05 pm',
 '06',
 '06 cet',
 '06 pm',
 '07',
 '07 15',
 '07 2015',
 '07 pm',
 '08',
 '08 15',
 '08 2015',
 '08 pm',
 '08 pm update',
 '08 update',
 '09',
 '09 15',
 '09 2015',
 '09 30',
 '09 pm',
 '09 pm update',
 '10',
 '10 00',
 '10 00 pm',
 '10 000',
 '10 01',
 '10 02',
 '10 03',
 '10 05',
 '10 07',
 '10 08',
 '10 08 update',
 '10 09',
 '10 10',
 '10 11',
 '10 12',
 '10 12 2015',
 '10 13',
 '10 14',
 '10 15',
 '10 16',
 '10 16 15',
 '10 17',
 '10 18',
 '10 19',
 '10 19 15',

***
## Save final model
Save final model to use it on streamlit. Save the tfidf vectorizer and the svc model as pkl files. Write a class to load both models and use them to categorize final text.

### Use smaller model, else size of files too big for streamlit

In [None]:
# TfidfVectorizer generates bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

# sublinear_tf: use logarithmic form for frequency
# min_df: minimum numbers of documents a word must be present to keep it
# ngram_range: number of ngrams to use
# stopwords: remove all common pronouns in given language

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                        ngram_range=(1, 3), max_features=40000)

In [None]:
df_train = load_stratified_dataset(path='../../Datasets/dataset_categories/dataset_categories_train.csv', labels='category', samples_per_label=100000, random_seed=42)
df_test = pd.read_csv('../../Datasets/dataset_categories/dataset_categories_test.csv')
df_train.shape, df_test.shape

In [None]:
from sklearn.model_selection import train_test_split

X_train = df_train.text_lem
X_test = df_test.text_lem
y_train = df_train.category
y_test = df_test.category

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
svc = LinearSVC(max_iter=10000, random_state=42, C=0.46, dual=False, loss='squared_hinge', penalty='l2')
svc.fit(X_train_tfidf, y_train)
print('Training accuracy: {:.2f}%;  Test accuracy: {:.2f}%'.format(svc.score(X_train_tfidf, y_train)*100, svc.score(X_test_tfidf, y_test)*100))

In [None]:
len(tfidf.vocabulary_)

In [None]:
#delattr(tfidf, 'stop_words')
delattr(tfidf, 'stop_words_')

In [None]:
pickle.dump(tfidf, open("tfidf.pkl", "wb"), protocol=pickle.HIGHEST_PROTOCOL) # 378M

In [None]:
!du -sh tfidf.pkl

In [None]:
pickle.dump(svc, open("svc.pkl", "wb"), protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
!du -sh svc.pkl

In [None]:
class Categorizer():
    def __init__(self):
        self.tfidf = pickle.load(open('tfidf.pkl', 'rb'))
        #self.lemmatizer = lemmatizer
        self.svc = pickle.load(open('svc.pkl', 'rb'))
    def preprocess(self, X):
        
        # Check if X is string, turn to list
        if type(X) == str:
            X = [X]
                    
        # Lemmatization
        #X_lem = [self.lemmatizer.lem_text(x) for x in X]
                
        # Tfidf vectorization
        X_tfidf = self.tfidf.transform(X)
        
        return X_tfidf
    
    def pred(self, X):
        
        # preprocess
        X_tfidf = self.preprocess(X)
        
        # return categories
        return self.svc.predict(X_tfidf)

In [None]:
categorizer = Categorizer()

In [None]:
pred = categorizer.pred(X_test)

In [None]:
(pred == y_test).sum()/y_test.shape[0]

Final accuracy is correct, loading and using model works fine.