In [2]:
%matplotlib inline

import os
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
from random import randint
import numpy as np
import pandas as pd

# sklearn for feature extraction & modeling
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

sns.set_style('white')

#sklearn for modelling
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV

In [3]:
# Move the working directory up one level so the relative 'data/...' paths
# read by the following cells resolve.
# The guard keeps this cell idempotent: without it, every re-execution would
# climb one more directory and the subsequent reads would fail.
if not Path('data/02_intermediate').is_dir():
    os.chdir('../')

In [4]:
# Read the preprocessed article table and preview its first rows.
docs_csv = Path('data/02_intermediate/preprocessed_docs_df.csv')
docs = pd.read_csv(docs_csv)
docs.head()

Unnamed: 0,category,heading,article,category_id
0,Business,Ad sales boost Time Warner profit,quarter profit us media giant timewarn jump 76...,0
1,Business,Dollar gains on Greenspan speech,dollar hit highest level euro almost three mon...,0
2,Business,Yukos unit buyer faces loan claim,owner embattl russian oil giant yuko ask buyer...,0
3,Business,High fuel prices hit BA's profits,british airway blame high fuel price 40 drop p...,0
4,Business,Pernod takeover talk lifts Domecq,share uk drink food firm alli domecq risen spe...,0


In [5]:
# Load the per-article TF-IDF table and drop the 'Unnamed: 0' column
# (presumably the index that was written out with the CSV — TODO confirm).
data_df = pd.read_csv('data/02_intermediate/preprocessed_tf_idf_articles.csv')
data_df = data_df.drop(columns=['Unnamed: 0'])

# The labels below are attached by index alignment with `docs`. If the two
# files ever disagree on row count, pandas would silently fill NaN / drop
# labels instead of raising, so fail fast here.
assert len(data_df) == len(docs), (
    f"row count mismatch: {len(data_df)} feature rows vs {len(docs)} documents"
)

# NOTE(review): if the feature table already has a column called 'output'
# (e.g. a vocabulary term), this assignment overwrites it — confirm.
data_df['output'] = docs['category_id']

In [6]:
# Name of the label column; every other column of data_df is a model feature.
output = 'output'
features = list(data_df.columns)
# Only the label column is removed here.
features.remove(output)

In [7]:
# 20 evenly spaced smoothing candidates in [0.006, 0.1], rounded to 4 decimals.
alpha = np.linspace(0.006, 0.1, 20).round(4)
alpha

array([0.006 , 0.0109, 0.0159, 0.0208, 0.0258, 0.0307, 0.0357, 0.0406,
       0.0456, 0.0505, 0.0555, 0.0604, 0.0654, 0.0703, 0.0753, 0.0802,
       0.0852, 0.0901, 0.0951, 0.1   ])

In [8]:
# Search space for GridSearchCV: a single grid that varies only `alpha`.
parameter_grid = [dict(alpha=alpha)]

In [9]:
# Multinomial naive Bayes estimator whose `alpha` is tuned below.
classifier1 = MultinomialNB()
# Grid search with cv=4, ranking candidates by the 'neg_log_loss' scorer.
gridsearch1 = GridSearchCV(
    estimator=classifier1,
    param_grid=parameter_grid,
    scoring='neg_log_loss',
    cv=4,
)
# Run the search on all feature columns against the label column; the fitted
# search object is the cell's displayed value.
gridsearch1.fit(data_df[features], data_df[output])

GridSearchCV(cv=4, estimator=MultinomialNB(),
             param_grid=[{'alpha': array([0.006 , 0.0109, 0.0159, 0.0208, 0.0258, 0.0307, 0.0357, 0.0406,
       0.0456, 0.0505, 0.0555, 0.0604, 0.0654, 0.0703, 0.0753, 0.0802,
       0.0852, 0.0901, 0.0951, 0.1   ])}],
             scoring='neg_log_loss')

In [10]:
# Collect, per candidate, the alpha value and its mean cross-validated score.
# The original went through `.data`: on the plain ndarray `mean_test_score`
# that is a raw buffer object rather than an array, and on the masked
# `param_alpha` array it may carry a non-numeric dtype. Converting both to
# float arrays explicitly gives two numeric columns and builds the frame in
# one step.
cv_results1 = gridsearch1.cv_results_
results1 = pd.DataFrame({
    'alpha': np.asarray(cv_results1['param_alpha'], dtype=float),
    'neglogloss': np.asarray(cv_results1['mean_test_score'], dtype=float),
})

In [11]:
# Mean cross-validated score for each candidate alpha from the first search.
# `px` is already imported in the notebook's import cell, so it is not
# re-imported here. A title and axis labels are set so the figure stands alone.
px.scatter(
    results1,
    x='alpha',
    y='neglogloss',
    title='MultinomialNB grid search: mean CV negative log loss vs. alpha',
    labels={'alpha': 'alpha', 'neglogloss': 'mean negative log loss (cv=4)'},
)

In [12]:
# Report the parameter setting that achieved the best mean score in the search.
best_params1 = gridsearch1.best_params_
print("Best parameter: ", best_params1)

Best parameter:  {'alpha': 0.0258}


In [13]:
# Candidate alphas for tuning the multinomial naive Bayes classifier:
# 20 evenly spaced values in [0.006, 0.1], rounded to 4 decimals.
# NOTE(review): this is the same range and resolution as the `alpha` grid
# searched earlier — confirm whether a different range was intended.
alpha_list2 = np.linspace(0.006, 0.1, 20).round(4)
alpha_list2

array([0.006 , 0.0109, 0.0159, 0.0208, 0.0258, 0.0307, 0.0357, 0.0406,
       0.0456, 0.0505, 0.0555, 0.0604, 0.0654, 0.0703, 0.0753, 0.0802,
       0.0852, 0.0901, 0.0951, 0.1   ])

In [14]:
# Rebind the search space to the second alpha list (replaces the earlier grid).
parameter_grid = [dict(alpha=alpha_list2)]

In [15]:
# Fresh multinomial naive Bayes estimator for the second search.
classifier2 = MultinomialNB()
# Grid search with cv=4, ranking candidates by the 'neg_log_loss' scorer.
gridsearch2 = GridSearchCV(
    estimator=classifier2,
    param_grid=parameter_grid,
    scoring='neg_log_loss',
    cv=4,
)
# Run the search on all feature columns against the label column; the fitted
# search object is the cell's displayed value.
gridsearch2.fit(data_df[features], data_df[output])

GridSearchCV(cv=4, estimator=MultinomialNB(),
             param_grid=[{'alpha': array([0.006 , 0.0109, 0.0159, 0.0208, 0.0258, 0.0307, 0.0357, 0.0406,
       0.0456, 0.0505, 0.0555, 0.0604, 0.0654, 0.0703, 0.0753, 0.0802,
       0.0852, 0.0901, 0.0951, 0.1   ])}],
             scoring='neg_log_loss')