In [1]:
import pandas as pd
from utils import CustomPreprocessor, ModelSelector

In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
df_sentences = pd.read_csv('../../raw_data/labeled_sentences_merged_imbalanced_12k_extra_classes.csv',usecols=['sentence','topic_label'])
print(f'Before dropping duplicates {len(df_sentences)}')
df_sentences.drop_duplicates(inplace=True)
print(f'After dropping duplicates {len(df_sentences)}')


Before dropping duplicates 12139
After dropping duplicates 8573


In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', CustomPreprocessor()),
    ('vectorizer', TfidfVectorizer(max_df=0.9)),
    ('model',SGDClassifier(loss='hinge'))
])
SGDClassifier().get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [58]:
pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor', CustomPreprocessor()),
  ('vectorizer', TfidfVectorizer(max_df=0.9)),
  ('model', SGDClassifier())],
 'verbose': False,
 'preprocessor': CustomPreprocessor(),
 'vectorizer': TfidfVectorizer(max_df=0.9),
 'model': SGDClassifier(),
 'preprocessor__accents': 'keep',
 'preprocessor__html': 'keep',
 'preprocessor__lemma': False,
 'preprocessor__negation': 'keep',
 'preprocessor__numbers': 'remove',
 'preprocessor__punctuation': 'remove',
 'preprocessor__remove_stopwords': True,
 'preprocessor__stem': True,
 'vectorizer__analyzer': 'word',
 'vectorizer__binary': False,
 'vectorizer__decode_error': 'strict',
 'vectorizer__dtype': numpy.float64,
 'vectorizer__encoding': 'utf-8',
 'vectorizer__input': 'content',
 'vectorizer__lowercase': True,
 'vectorizer__max_df': 0.9,
 'vectorizer__max_features': None,
 'vectorizer__min_df': 1,
 'vectorizer__ngram_range': (1, 1),
 'vectorizer__norm': 'l2',
 'vectorizer__preprocessor': None,
 'vectorizer__smooth_id

In [59]:
X = df_sentences[['sentence']].astype(str)
y_topic = df_sentences.loc[:,'topic_label'].values

In [60]:
from sklearn.model_selection import train_test_split
X_topic_train, X_topic_test, y_topic_train, y_topic_test = train_test_split(X,y_topic,test_size=0.25,random_state=42)


In [61]:
param_grid = {
    # 'model__loss' : ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron']
    
}

topic_grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1,scoring='f1_weighted',verbose=3,error_score='raise').fit(X_topic_train, y_topic_train)



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 5/5] END ..................................., score=0.727 total time=  11.4s
[CV 4/5] END ..................................., score=0.737 total time=  11.4s
[CV 1/5] END ..................................., score=0.724 total time=  11.5s
[CV 3/5] END ..................................., score=0.757 total time=  11.6s
[CV 2/5] END ..................................., score=0.739 total time=  11.7s


In [62]:
topic_grid.best_score_

0.7370244591295634

In [63]:
topic_grid.best_params_

{}