In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('forTraining.csv',index_col=0)
df.tail()

Unnamed: 0,topic_area,text
2495,science,coronavirus kill clinician trace ferocious ram...
2496,science,sniprs take aim diseaserelated mutation scienc...
2497,science,daily briefing protein surface coronavirus mig...
2498,science,sewage could reveal true scale coronavirus out...
2499,science,study claiming new coronavirus transmitted peo...


In [3]:
y = pd.DataFrame(df['topic_area'])
X = pd.DataFrame(df['text'])

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [6]:
import numpy as np
text_clf.fit(X_train['text'], y_train['topic_area'])
predicted = text_clf.predict(X_test['text'])
np.mean(predicted == y_test['topic_area'])

0.726

In [7]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    business       0.86      0.55      0.67        98
     finance       0.61      0.68      0.65       103
     general       0.73      0.76      0.75       100
     science       0.69      0.99      0.82        96
        tech       0.83      0.66      0.74       103

    accuracy                           0.73       500
   macro avg       0.74      0.73      0.72       500
weighted avg       0.74      0.73      0.72       500



In [8]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, random_state=42,
max_iter=5, tol=None)),
])

text_clf_svm.fit(X_train['text'], y_train['topic_area'])
predicted = text_clf_svm.predict(X_test['text'])
np.mean(predicted == y_test['topic_area'])

0.814

In [9]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    business       0.78      0.85      0.81        98
     finance       0.78      0.67      0.72       103
     general       0.90      0.81      0.85       100
     science       0.86      0.99      0.92        96
        tech       0.76      0.77      0.76       103

    accuracy                           0.81       500
   macro avg       0.81      0.82      0.81       500
weighted avg       0.81      0.81      0.81       500



In [10]:
from sklearn.model_selection import GridSearchCV
parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),
}

In [11]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train['text'], y_train['topic_area'])



In [12]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.7865
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [13]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['text'], y_train['topic_area'])
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.8385
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
