In [1]:
import pandas as pd
from utils import CustomPreprocessor

In [2]:
# Load only the sentence text and its two label columns from the merged CSV.
wanted_columns = ['sentence','topic_label','sentiment_label']
df = pd.read_csv(
    '../../raw_data/labeled_sentences_merged_imbalanced_12k_extra_classes.csv',
    usecols=wanted_columns,
)
# Display the first row as a quick sanity check of the load.
df.head(1)

Unnamed: 0,sentence,topic_label,sentiment_label
0,Fallout from the scandal could lead to a lost ...,Governance,Negative


In [3]:
# Features: keep `sentence` as a single-column DataFrame, with every entry cast to str.
X = df.loc[:, ['sentence']].astype(str)
# Targets: one raw values array per labelling task (topic and sentiment).
y_topic = df['topic_label'].values
y_sentiment = df['sentiment_label'].values

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Assemble the classifier as three named stages; the stage names are the
# prefixes used later when addressing hyper-parameters (e.g. `nb__alpha`).
pipeline_steps = [
    ('preprocessor', CustomPreprocessor()),  # project-specific transformer from utils
    ('vectorizer', TfidfVectorizer()),       # text -> TF-IDF features
    ('nb', MultinomialNB()),                 # multinomial Naive Bayes classifier
]
pipeline = Pipeline(pipeline_steps)
# Display the default hyper-parameters of a fresh MultinomialNB for reference.
MultinomialNB().get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}

In [5]:
from sklearn.model_selection import train_test_split
# Both tasks are split with the same settings: 25% held out, fixed seed 42.
split_options = {'test_size': 0.25, 'random_state': 42}
X_topic_train, X_topic_test, y_topic_train, y_topic_test = train_test_split(
    X, y_topic, **split_options
)
X_sentiment_train, X_sentiment_test, y_sentiment_train, y_sentiment_test = train_test_split(
    X, y_sentiment, **split_options
)


In [7]:
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score,make_scorer
# Define the parameter grid
# Keys are `<pipeline step>__<parameter>`. GridSearchCV evaluates the full
# Cartesian product of these lists: 92,160 candidates, i.e. 460,800 fits at
# cv=5. Consider trimming the lists before launching a complete run.
param_grid = {
    'preprocessor__accents':['keep','remove'],
    'preprocessor__html':['keep','remove'],
    'preprocessor__negation':['keep','remove'],
    'preprocessor__numbers':['keep','remove'],
    'preprocessor__punctuation':['keep','remove'],
    'preprocessor__remove_stopwords':[True,False],
    'preprocessor__stem':[True,False],
    'preprocessor__lemma':[True,False],
    'vectorizer__max_df':[1.0,0.9,0.8,0.7,0.6,0.5],
    'vectorizer__min_df':[1,5,10,100,1000],
    'vectorizer__ngram_range':[(1,1),(1,2),(1,3)],
    'nb__alpha':[1.0,0.9,0.7,0.5]
}

# The topic labels are strings such as 'Governance'. precision/recall/f1 default
# to average='binary' with pos_label=1, which raises ValueError on such targets;
# GridSearchCV then records the failure as score=nan instead of stopping.
# An explicit multiclass average is therefore required. 'macro' gives every
# class equal weight regardless of how many samples it has.
MULTICLASS_AVERAGE = 'macro'

accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, average=MULTICLASS_AVERAGE)
recall_scorer = make_scorer(recall_score, average=MULTICLASS_AVERAGE)
f1_scorer = make_scorer(f1_score, average=MULTICLASS_AVERAGE)

# Define the grid search: 5-fold CV, all CPU cores, model selection by macro-F1.
# NOTE(review): if scores are still nan after this change, the fit itself is
# failing -- pass error_score='raise' on a reduced grid to see the traceback.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1,scoring=f1_scorer,verbose=3)

# Fit the grid search to the topic-classification training data
grid_search.fit(X_topic_train, y_topic_train)


Fitting 5 folds for each of 92160 candidates, totalling 460800 fits
[CV 1/5] END nb__alpha=1.0, preprocessor__accents=keep, preprocessor__html=keep, preprocessor__lemma=True, preprocessor__negation=keep, preprocessor__numbers=keep, preprocessor__punctuation=keep, preprocessor__remove_stopwords=True, preprocessor__stem=True, vectorizer__max_df=1.0, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1);, score=nan total time=   0.0s
[CV 3/5] END nb__alpha=1.0, preprocessor__accents=keep, preprocessor__html=keep, preprocessor__lemma=True, preprocessor__negation=keep, preprocessor__numbers=keep, preprocessor__punctuation=keep, preprocessor__remove_stopwords=True, preprocessor__stem=True, vectorizer__max_df=1.0, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1);, score=nan total time=   0.0s
[CV 5/5] END nb__alpha=1.0, preprocessor__accents=keep, preprocessor__html=keep, preprocessor__lemma=True, preprocessor__negation=keep, preprocessor__numbers=keep, preprocessor__punctuation=keep, prep

In [None]:
# Report the winning hyper-parameter combination and its mean cross-validated score.
best_summary = (
    ("Best Parameters: ", grid_search.best_params_),
    ("Best Score: ", grid_search.best_score_),
)
for label, value in best_summary:
    print(label, value)