In [1]:
import pandas as pd
from utils import CustomPreprocessor, ModelSelector

In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [3]:
# Fetch the NLTK data packages for this notebook; packages that are already
# present locally are simply reported as up to date.
nltk_packages = ('stopwords', 'punkt', 'wordnet', 'omw-1.4')
download_results = [nltk.download(package) for package in nltk_packages]
# Keep the status of the final download as the cell's displayed value.
download_results[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Load only the sentence text and its two label columns, then remove rows that
# are exact duplicates across those columns. The row counts before and after
# are printed so the effect of de-duplication is visible.
# Note: the index is not reset, so surviving rows keep their original labels.
df_sentences = pd.read_csv(
    '../../raw_data/labeled_sentences_merged_imbalanced_12k_extra_classes.csv',
    usecols=['sentence', 'topic_label', 'sentiment_label'],
)
print(f'Before dropping duplicates {df_sentences.shape[0]}')
df_sentences = df_sentences.drop_duplicates()
print(f'After dropping duplicates {df_sentences.shape[0]}')


Before dropping duplicates 12139
After dropping duplicates 8573


In [5]:
# Load the full article texts (single column) and remove exact duplicate rows,
# printing the row count before and after.
# Note: the index is not reset, so surviving rows keep their original labels.
df_articles = pd.read_csv(
    '../../raw_data/volkswagon_news_text.csv',
    usecols=['full_text'],
)
print(f'Before dropping duplicates {df_articles.shape[0]}')
df_articles = df_articles.drop_duplicates()
print(f'After dropping duplicates {df_articles.shape[0]}')


Before dropping duplicates 4525
After dropping duplicates 3115


In [6]:
# Preview the de-duplicated sentences. drop_duplicates() did not renumber the
# rows, so the index labels shown are the original CSV positions and can be
# larger than the number of remaining rows.
# NOTE(review): some topic_label cells appear blank in the saved output --
# confirm whether these are missing values before using them as targets.
df_sentences

Unnamed: 0,sentence,topic_label,sentiment_label
0,Fallout from the scandal could lead to a lost ...,Governance,Negative
1,The damning parliamentary report into the demi...,Social,Negative
2,The BHS scandal has been described by MPs as t...,Environmental,Negative
3,"Dominic Chappell, the businessman who bought B...",Social,Neutral
4,The fallout from the scandal could lead to a k...,Governance,Negative
...,...,...,...
12133,Seoul-based Daol Investment & Securities analy...,,Positive
12134,"""It may take a few years, but eventually the l...",Environmental,Positive
12135,(Reporting by Heekyong Yang in Seoul and Ben K...,,Neutral
12136,Click For Restrictions - https://agency.reuter...,,Neutral


In [7]:
# Features: the sentence text, kept as a one-column DataFrame and cast to str.
X = df_sentences.loc[:, ['sentence']].astype(str)

# Targets: one label array per task, taken from the same rows as X.
y_topic = df_sentences['topic_label'].values
y_sentiment = df_sentences['sentiment_label'].values

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Text-classification pipeline, applied in this order:
#   1. 'preprocessor' - project-specific CustomPreprocessor (from utils)
#   2. 'vectorizer'   - TF-IDF features with default settings
#   3. 'model'        - project-specific ModelSelector (from utils)
# The step names are the prefixes used when addressing parameters in a grid
# search (e.g. 'vectorizer__max_df').
pipeline_steps = [
    ('preprocessor', CustomPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('model', ModelSelector()),
]
pipeline = Pipeline(steps=pipeline_steps)
# Handy while exploring: ModelSelector().get_params()

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for each task. Both calls share the same settings
# (including the seed), so the topic and sentiment tasks are split the same way.
split_options = {'test_size': 0.25, 'random_state': 42}

X_topic_train, X_topic_test, y_topic_train, y_topic_test = train_test_split(
    X, y_topic, **split_options
)
X_sentiment_train, X_sentiment_test, y_sentiment_train, y_sentiment_test = train_test_split(
    X, y_sentiment, **split_options
)


In [10]:
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score,make_scorer

# Hyper-parameter grid. Only the classifier choice ('model__model') is searched
# for now; the commented entries are additional knobs that can be re-enabled
# one at a time. Keys follow the '<pipeline step>__<parameter>' convention.
param_grid = {
    # 'preprocessor__accents':['keep','remove'],
    # 'preprocessor__html':['keep','remove'],
    # 'preprocessor__negation':['keep','remove'],
    # 'preprocessor__numbers':['keep','remove'],
    # 'preprocessor__punctuation':['keep','remove'],
    # 'preprocessor__remove_stopwords':[True,False],
    # 'preprocessor__stem':[True,False],
    # 'preprocessor__lemma':[True,False],
    # 'vectorizer__max_df':[1.0,0.8],
    # 'vectorizer__min_df':[1,10],
    # 'vectorizer__ngram_range':[(1,1),(1,2)],
    # NOTE(review): the pipeline has no step named 'nb' (its steps are
    # 'preprocessor', 'vectorizer' and 'model'), so this key needs a prefix
    # that matches the 'model' step before it can be enabled.
    # 'nb__alpha':[1.0,0.9,0.7,0.5]
    'model__model' : ['NB','SVM','RF','DT','Reg']
}

# The targets are multiclass, so f1_score needs an explicit averaging mode: its
# default average='binary' raises an error on multiclass labels. Weighted
# averaging is what the built-in 'f1_weighted' scoring string computes, so the
# reported scores are unchanged.
f1_scorer = make_scorer(f1_score, average='weighted')

# Run the 5-fold grid search for the topic task on the training split, using
# all CPU cores. error_score='raise' makes a failing candidate abort the search
# instead of being silently scored as NaN.
topic_grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring=f1_scorer,
    verbose=3,
    error_score='raise',
).fit(X_topic_train, y_topic_train)
# sentiment_grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1,scoring=f1_scorer,verbose=2,error_score='raise').fit(X_sentiment_train, y_sentiment_train)



Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ...................model__model=NB;, score=0.608 total time=  10.6s
[CV 4/5] END ...................model__model=NB;, score=0.619 total time=  10.6s
[CV 5/5] END ...................model__model=NB;, score=0.618 total time=   9.3s
[CV 2/5] END ..................model__model=SVM;, score=0.739 total time=  10.8s
[CV 3/5] END ..................model__model=SVM;, score=0.760 total time=  10.5s
[CV 2/5] END ...................model__model=NB;, score=0.636 total time=   9.5s
[CV 3/5] END ...................model__model=NB;, score=0.623 total time=  10.0s
[CV 1/5] END ..................model__model=SVM;, score=0.724 total time=   9.8s
[CV 1/5] END ...................model__model=DT;, score=0.670 total time=  12.3s
[CV 4/5] END ..................model__model=SVM;, score=0.733 total time=  13.9s
[CV 5/5] END ..................model__model=SVM;, score=0.736 total time=  14.4s
[CV 1/5] END ...................model__model=RF;,

In [11]:
# Tabulate the grid-search results: one row per candidate parameter setting.
df_cv_results = pd.DataFrame.from_dict(topic_grid.cv_results_)


In [12]:
# Show the candidates best-first (rank 1 = highest mean test score), capped at 40 rows.
df_cv_results.sort_values('rank_test_score').head(40)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__model,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,9.353714,1.198838,2.517534,0.719666,SVM,{'model__model': 'SVM'},0.724249,0.738953,0.760413,0.73271,0.736469,0.738559,0.012009,1
4,9.493708,0.58719,1.716945,0.320112,Reg,{'model__model': 'Reg'},0.715037,0.725852,0.729443,0.718914,0.71253,0.720355,0.006394,2
2,20.24673,4.138729,2.826756,0.536475,RF,{'model__model': 'RF'},0.71669,0.695786,0.709486,0.712817,0.713305,0.709617,0.007282,3
3,12.790335,2.34671,2.500074,0.387006,DT,{'model__model': 'DT'},0.66977,0.659194,0.63588,0.658567,0.654943,0.655671,0.011062,4
0,8.08233,0.533291,1.906039,0.052334,NB,{'model__model': 'NB'},0.608113,0.636492,0.623256,0.618817,0.617804,0.620896,0.009231,5


In [13]:
# Parameter combination that achieved the best mean cross-validated score in
# the topic grid search.
topic_grid.best_params_

{'model__model': 'SVM'}