In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv(
    'forModelTraining.csv',
    index_col=0,
)
# Show the last rows as a quick sanity check of the loaded frame.
df.tail()

Unnamed: 0,category,text
5995,ENVIRONMENT,kemosabe chatty porcupine eats banana video ke...
5996,ENVIRONMENT,bear camera capture life eye alaskan animal vi...
5997,ENVIRONMENT,first day fall autumnal equinox arrives septem...
5998,ENVIRONMENT,climate change could turn greenland well green...
5999,ENVIRONMENT,diy tshirt project idea reusing old tee networ...


In [3]:
# Keep features and labels as single-column DataFrames: later cells address
# them by column name (X_train['text'], y_train['category']).
X_train = df['text'].to_frame()
y_train = df['category'].to_frame()

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Naive Bayes text classifier: token counts -> TF-IDF transform -> MultinomialNB.
# The step names ('vect', 'tfidf', 'clf') are used as prefixes in the
# grid-search parameter keys, so they are kept exactly as they are.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [5]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD using hinge loss and an L2 penalty,
# fed by the same count -> TF-IDF front end as the other pipelines.
# NOTE(review): with tol=None the tolerance-based stopping check is presumably
# off, so training would always run max_iter=5 epochs -- confirm in the docs.
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf-svm',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            max_iter=5,
            tol=None,
            random_state=42,
        ),
    ),
])

In [6]:
from sklearn.neural_network import MLPClassifier

# Neural-network variant: count -> TF-IDF features feeding an MLP classifier
# that uses the 'lbfgs' solver with a fixed random_state for repeatability.
text_clf_nn = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf-nn',
        MLPClassifier(
            solver='lbfgs',
            random_state=0,
        ),
    ),
])

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the Naive Bayes pipeline. Each key is
# '<pipeline step name>__<parameter name>'.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [None]:
# Grid search over `parameters` for the Naive Bayes pipeline, run with
# n_jobs=-1. fit() returns the fitted search object, so construction and
# fitting are chained into one assignment.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(
    X_train['text'].values.astype(str),
    y_train['category'].values.astype(str),
)

In [None]:
# Best score found by the search, then the parameter combination that
# produced it, one per line.
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the SVM pipeline. The regularisation key carries the
# 'clf-svm' step-name prefix, which contains a hyphen and therefore has to be
# written as a string key in a dict literal.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# Run the search and keep the fitted search object (fit() returns it).
gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters_svm,
    n_jobs=-1,
).fit(
    X_train['text'].values.astype(str),
    y_train['category'].values.astype(str),
)

# Best score, then the parameters that achieved it.
print(gs_clf_svm.best_score_, gs_clf_svm.best_params_, sep='\n')

In [7]:
# Load the evaluation set; the first CSV column becomes the row index.
df2 = pd.read_csv(
    'forEvaluating_v2.csv',
    index_col=0,
)
# Show the last rows as a quick sanity check of the loaded frame.
df2.tail()

Unnamed: 0,category,predicted_category,text
995,GENERAL,995,japan expanding state emergency entire nation ...
996,GENERAL,996,south african president extends lockdown two w...
997,GENERAL,997,uk report largest oneday death toll coronaviru...
998,POLITICS,998,hill campaign report biden seek counter trump ...
999,GENERAL,999,nigeria issue warning drug touted trump overdo...


In [8]:
# Evaluation features and labels as single-column DataFrames; later cells
# read them as X['text'] and y['category'].
X = df2['text'].to_frame()
y = df2['category'].to_frame()

In [9]:
# Fit the Naive Bayes pipeline on the training texts and classify the
# evaluation texts in one chained call (fit() returns the fitted pipeline).
predicted = text_clf.fit(
    X_train['text'].values.astype(str),
    y_train['category'].values.astype(str),
).predict(X['text'].values.astype(str))
# Accuracy: fraction of predictions equal to the true label (cell output).
(predicted == y['category'].values.astype(str)).mean()

0.533

In [10]:
from sklearn import metrics

# Per-class precision / recall / F1 for whatever model last assigned
# `predicted`, compared against the evaluation labels.
print(
    metrics.classification_report(
        y_true=y['category'].values.astype(str),
        y_pred=predicted,
    )
)

              precision    recall  f1-score   support

    BUSINESS       0.39      0.64      0.48        99
 ENVIRONMENT       0.28      0.74      0.40        46
     GENERAL       0.89      0.41      0.56       574
    POLITICS       0.39      0.86      0.53       116
     SCIENCE       0.67      0.41      0.51        54
        TECH       0.49      0.71      0.58       111

    accuracy                           0.53      1000
   macro avg       0.52      0.63      0.51      1000
weighted avg       0.70      0.53      0.54      1000



In [11]:
# Train the SVM pipeline on the training texts.
text_clf_svm.fit(
    X=X_train['text'].values.astype(str),
    y=y_train['category'].values.astype(str),
)
# Classify the evaluation texts.
predicted = text_clf_svm.predict(X=X['text'].values.astype(str))
# Accuracy: fraction of predictions equal to the true label (cell output).
(predicted == y['category'].values.astype(str)).mean()

0.576

In [12]:
from sklearn import metrics

# Per-class precision / recall / F1 for whatever model last assigned
# `predicted`, compared against the evaluation labels.
print(
    metrics.classification_report(
        y_true=y['category'].values.astype(str),
        y_pred=predicted,
    )
)

              precision    recall  f1-score   support

    BUSINESS       0.43      0.68      0.52        99
 ENVIRONMENT       0.42      0.78      0.55        46
     GENERAL       0.93      0.44      0.59       574
    POLITICS       0.39      0.93      0.55       116
     SCIENCE       0.56      0.56      0.56        54
        TECH       0.54      0.77      0.64       111

    accuracy                           0.58      1000
   macro avg       0.54      0.69      0.57      1000
weighted avg       0.73      0.58      0.58      1000



In [13]:
# Fit the MLP pipeline on the training texts and classify the evaluation
# texts in one chained call (fit() returns the fitted pipeline).
predicted = text_clf_nn.fit(
    X_train['text'].values.astype(str),
    y_train['category'].values.astype(str),
).predict(X['text'].values.astype(str))
# Accuracy: fraction of predictions equal to the true label (cell output).
(predicted == y['category'].values.astype(str)).mean()

0.493

In [14]:
from sklearn import metrics

# Per-class precision / recall / F1 for whatever model last assigned
# `predicted`, compared against the evaluation labels.
print(
    metrics.classification_report(
        y_true=y['category'].values.astype(str),
        y_pred=predicted,
    )
)

              precision    recall  f1-score   support

    BUSINESS       0.40      0.65      0.49        99
 ENVIRONMENT       0.36      0.65      0.46        46
     GENERAL       0.90      0.36      0.52       574
    POLITICS       0.31      0.88      0.45       116
     SCIENCE       0.30      0.43      0.35        54
        TECH       0.58      0.59      0.59       111

    accuracy                           0.49      1000
   macro avg       0.47      0.59      0.48      1000
weighted avg       0.69      0.49      0.50      1000



In [None]:
# Load the records from forPredicting.json into a DataFrame.
df3 = pd.read_json(
    'forPredicting.json',
)
# Show the last rows as a quick sanity check of the loaded frame.
df3.tail()

In [None]:
# Rebind X to the texts from df3 as a single-column DataFrame.
# NOTE: from here on X no longer refers to the evaluation texts taken from df2.
X = df3['text'].to_frame()

In [None]:
# Re-fit the SVM pipeline on the training set ...
text_clf_svm.fit(
    X=X_train['text'].values.astype(str),
    y=y_train['category'].values.astype(str),
)
# ... and classify the texts currently held in X.
predicted = text_clf_svm.predict(X=X['text'].values.astype(str))

In [None]:
# Measure how long a full re-fit of the SVM pipeline on the training set takes.
start_time = datetime.now()
text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
# str(timedelta) renders as 'H:MM:SS.ffffff', which is not a number of seconds
# even though the message labels it as such. total_seconds() gives the float
# that matches the unit printed after it.
train_seconds = (datetime.now() - start_time).total_seconds()
print('Time taken to train model: ' + str(train_seconds) + ' seconds')

In [None]:
# Load Combined.json and add a 'predicted_category' column that starts out as
# a copy of the 'source' column; a later cell overwrites it with predictions.
df4 = pd.read_json('Combined.json').assign(
    predicted_category=lambda frame: frame['source'],
)
# Show the last rows as a quick sanity check of the loaded frame.
df4.tail()

In [None]:
# Time one prediction pass of the fitted SVM pipeline over X['text'] and
# report the elapsed time and throughput.
# NOTE(review): X is built from df3, while the predictions are later written
# into df4 -- confirm both frames hold the same records in the same order.
start_time = datetime.now()
predicted = text_clf_svm.predict((X['text'].values.astype(str)))
end_time = datetime.now()
# total_seconds() covers the whole timedelta (days included) as one float,
# instead of rebuilding it from the .microseconds and .seconds fields.
time_taken = (end_time - start_time).total_seconds()
# Throughput uses the number of predictions actually produced rather than a
# non-null count taken from a different frame. The guard avoids a
# ZeroDivisionError when the clock reports no elapsed time.
if time_taken > 0:
    records_classified = round(len(predicted) / time_taken)
else:
    records_classified = float('inf')
print('Time taken to classify: ' + str(time_taken) + ' seconds')
print('Records classified per second: ' + str(records_classified))

In [None]:
# Store the predictions in df4 by row position.
# The previous row loop used each index *label* as a position into `predicted`.
# That is only correct for a 0..n-1 index, and it silently accepted a
# `predicted` array longer than df4. Check the lengths, then assign the whole
# column at once.
if len(predicted) != len(df4):
    raise ValueError(
        'predicted has {} entries but df4 has {} rows; '
        'predictions must be made on the same records as df4'.format(len(predicted), len(df4))
    )
df4['predicted_category'] = predicted

In [None]:
# Inspect the last rows of df4 after the predictions were written into it.
df4.tail()

In [None]:
# Non-null count per column; a quick check for missing values in df4.
df4.count()

In [None]:
# Write df4 (including the 'predicted_category' column) to a JSON file using
# pandas' default to_json settings.
df4.to_json("Combined Predicted.json")