In [86]:
import pandas as pd
from datetime import datetime

In [32]:
df = pd.read_csv('forModelTraining.csv',index_col=0)
df.tail()

Unnamed: 0,category,text
5995,ENVIRONMENT,kemosabe chatty porcupine eats banana video ke...
5996,ENVIRONMENT,bear camera capture life eye alaskan animal vi...
5997,ENVIRONMENT,first day fall autumnal equinox arrives septem...
5998,ENVIRONMENT,climate change could turn greenland well green...
5999,ENVIRONMENT,diy tshirt project idea reusing old tee networ...


In [33]:
y = pd.DataFrame(df['category'])
X = pd.DataFrame(df['text'])

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [65]:
import numpy as np
text_clf.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.7225

In [7]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.75      0.60      0.67       204
 ENVIRONMENT       0.69      0.85      0.76       180
     GENERAL       0.58      0.76      0.66       204
    POLITICS       0.75      0.69      0.72       207
     SCIENCE       0.81      0.54      0.65       203
        TECH       0.76      0.83      0.79       202

    accuracy                           0.71      1200
   macro avg       0.72      0.71      0.71      1200
weighted avg       0.72      0.71      0.71      1200



In [66]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, random_state=42,
max_iter=5, tol=None)),
])

text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_svm.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.6891666666666667

In [9]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.76      0.54      0.63       204
 ENVIRONMENT       0.73      0.79      0.76       180
     GENERAL       0.55      0.80      0.66       204
    POLITICS       0.76      0.70      0.73       207
     SCIENCE       0.73      0.58      0.65       203
        TECH       0.79      0.82      0.80       202

    accuracy                           0.70      1200
   macro avg       0.72      0.71      0.70      1200
weighted avg       0.72      0.70      0.70      1200



In [10]:
from sklearn.neural_network import MLPClassifier

text_clf_nn = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf-nn', MLPClassifier(solver='lbfgs', random_state=0)),
])


In [38]:
text_clf_nn.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_nn.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.71

In [12]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.69      0.63      0.66       204
 ENVIRONMENT       0.71      0.77      0.74       180
     GENERAL       0.55      0.78      0.65       204
    POLITICS       0.74      0.65      0.69       207
     SCIENCE       0.74      0.63      0.68       203
        TECH       0.82      0.73      0.77       202

    accuracy                           0.69      1200
   macro avg       0.71      0.70      0.70      1200
weighted avg       0.71      0.69      0.70      1200



In [13]:
from sklearn.model_selection import GridSearchCV
parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),
}

In [14]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))



In [15]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.6697916666666667
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [16]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.6910416666666667
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [76]:
df2 = pd.read_csv('forEvaluating_v2.csv',index_col=0)
df2.tail()

Unnamed: 0,category,predicted_category,text
995,GENERAL,995,japan expanding state emergency entire nation ...
996,GENERAL,996,south african president extends lockdown two w...
997,GENERAL,997,uk report largest oneday death toll coronaviru...
998,POLITICS,998,hill campaign report biden seek counter trump ...
999,GENERAL,999,nigeria issue warning drug touted trump overdo...


In [81]:
y = pd.DataFrame(df2['category'])
X = pd.DataFrame(df2['text'])

In [82]:
text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_svm.predict((X['text'].values.astype(str)))
np.mean(predicted == y['category'].values.astype(str))

0.582

In [90]:
start_time = datetime.now()
text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
print('Time taken to train model: ' + str(datetime.now() - start_time) + ' seconds')

Time taken to train model: 0:00:00.169011 seconds


In [91]:
start_time = datetime.now()
predicted = text_clf_svm.predict((X['text'].values.astype(str)))
print('Time taken to apply model: ' + str(datetime.now() - start_time) + ' seconds')

Time taken to apply model: 0:00:00.056816 seconds


In [67]:
df3 = pd.read_json('forPredicting.json')
df3.tail()

Unnamed: 0,text,predicted_category
11666,hong kong highly resilient fight save economy ...,Weizhen Tan
11667,unreasonable china economy shrink 10 first qua...,Evelyn Cheng
11668,u company still betting chinese consumer despi...,Evelyn Cheng
11669,coronavirus live update global case 856900 dea...,Saheli Roy Choudhury
11670,u clinical trial hydroxychloroquine hit warp s...,Berkeley Lovelace Jr.


In [68]:
X = pd.DataFrame(df3['text'])

In [69]:
text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_svm.predict((X['text'].values.astype(str)))

In [71]:
df4 = pd.read_json('Combined.json')
df4['predicted_category'] = df4['author']
df4.tail()

Unnamed: 0,description,title,url,author,publishedAt,content,source,urlToImage,predicted_category
11666,"Hong Kong is fighting a ""twin battle"" of the c...",Hong Kong is 'highly resilient' as it fights t...,https://www.cnbc.com/2020/04/14/coronavirus-ho...,Weizhen Tan,2020-04-14T06:39:13Z,"Hong Kong's popular nightlife district, Lan Kw...","{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10647610...,Weizhen Tan
11667,An early look at Chinese business conditions i...,'Not unreasonable' for China's economy to shri...,https://www.cnbc.com/2020/03/24/china-beige-bo...,Evelyn Cheng,2020-03-24T06:46:36Z,People wear masks as they cross a street durin...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10642183...,Evelyn Cheng
11668,The American Chamber of Commerce in China's su...,US companies are still betting on Chinese cons...,https://www.cnbc.com/2020/03/26/us-companies-s...,Evelyn Cheng,2020-03-25T23:28:43Z,An employee works next to shoes on display ins...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10598809...,Evelyn Cheng
11669,"Most of the deaths occurred in Europe: 12,428 ...",Coronavirus live updates: Global cases over 85...,https://www.cnbc.com/2020/04/01/coronavirus-li...,Saheli Roy Choudhury,2020-04-01T00:34:44Z,This is a live blog. Please check back for upd...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10646758...,Saheli Roy Choudhury
11670,Researchers at NYU Langone on March 24 launche...,US clinical trials of hydroxychloroquine hit '...,https://www.cnbc.com/2020/04/15/coronavirus-us...,Berkeley Lovelace Jr.,2020-04-15T12:50:57Z,A bottle of Prasco Laboratories Hydroxychloroq...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10648079...,Berkeley Lovelace Jr.


In [72]:
for i, row in df4.iterrows():
    df4.at[i,'predicted_category'] = predicted[i]

In [75]:
df4.tail()

Unnamed: 0,description,title,url,author,publishedAt,content,source,urlToImage,predicted_category
11666,"Hong Kong is fighting a ""twin battle"" of the c...",Hong Kong is 'highly resilient' as it fights t...,https://www.cnbc.com/2020/04/14/coronavirus-ho...,Weizhen Tan,2020-04-14T06:39:13Z,"Hong Kong's popular nightlife district, Lan Kw...","{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10647610...,BUSINESS
11667,An early look at Chinese business conditions i...,'Not unreasonable' for China's economy to shri...,https://www.cnbc.com/2020/03/24/china-beige-bo...,Evelyn Cheng,2020-03-24T06:46:36Z,People wear masks as they cross a street durin...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10642183...,BUSINESS
11668,The American Chamber of Commerce in China's su...,US companies are still betting on Chinese cons...,https://www.cnbc.com/2020/03/26/us-companies-s...,Evelyn Cheng,2020-03-25T23:28:43Z,An employee works next to shoes on display ins...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10598809...,BUSINESS
11669,"Most of the deaths occurred in Europe: 12,428 ...",Coronavirus live updates: Global cases over 85...,https://www.cnbc.com/2020/04/01/coronavirus-li...,Saheli Roy Choudhury,2020-04-01T00:34:44Z,This is a live blog. Please check back for upd...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10646758...,GENERAL
11670,Researchers at NYU Langone on March 24 launche...,US clinical trials of hydroxychloroquine hit '...,https://www.cnbc.com/2020/04/15/coronavirus-us...,Berkeley Lovelace Jr.,2020-04-15T12:50:57Z,A bottle of Prasco Laboratories Hydroxychloroq...,"{'id': 'cnbc', 'name': 'CNBC'}",https://image.cnbcfm.com/api/v1/image/10648079...,SCIENCE


In [74]:
df4.to_json("Combined Predicted.json")