In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('forModelTraining.csv',index_col=0)
df.tail()

Unnamed: 0,category,text
5995,ENVIRONMENT,kemosabe chatty porcupine eats banana video ke...
5996,ENVIRONMENT,bear camera capture life eye alaskan animal vi...
5997,ENVIRONMENT,first day fall autumnal equinox arrives septem...
5998,ENVIRONMENT,climate change could turn greenland well green...
5999,ENVIRONMENT,diy tshirt project idea reusing old tee networ...


In [3]:
y = pd.DataFrame(df['category'])
X = pd.DataFrame(df['text'])

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [9]:
import numpy as np
text_clf.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.6958333333333333

In [10]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.70      0.57      0.63       201
 ENVIRONMENT       0.73      0.81      0.77       197
     GENERAL       0.57      0.75      0.65       194
    POLITICS       0.74      0.77      0.75       201
     SCIENCE       0.86      0.53      0.66       221
        TECH       0.65      0.77      0.71       186

    accuracy                           0.70      1200
   macro avg       0.71      0.70      0.69      1200
weighted avg       0.71      0.70      0.69      1200



In [12]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, random_state=42,
max_iter=5, tol=None)),
])

text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_svm.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.7058333333333333

In [13]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.75      0.57      0.65       201
 ENVIRONMENT       0.72      0.75      0.73       197
     GENERAL       0.80      0.61      0.70       194
    POLITICS       0.76      0.69      0.73       201
     SCIENCE       0.61      0.81      0.69       221
        TECH       0.68      0.79      0.73       186

    accuracy                           0.71      1200
   macro avg       0.72      0.70      0.71      1200
weighted avg       0.72      0.71      0.70      1200



In [14]:
from sklearn.neural_network import MLPClassifier

text_clf_nn = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf-nn', MLPClassifier(solver='lbfgs', random_state=0)),
])


In [15]:
text_clf_nn.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_nn.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.6733333333333333

In [16]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.69      0.60      0.64       201
 ENVIRONMENT       0.69      0.72      0.70       197
     GENERAL       0.55      0.73      0.63       194
    POLITICS       0.77      0.72      0.74       201
     SCIENCE       0.77      0.57      0.65       221
        TECH       0.64      0.73      0.68       186

    accuracy                           0.67      1200
   macro avg       0.68      0.68      0.67      1200
weighted avg       0.69      0.67      0.67      1200



In [17]:
from sklearn.model_selection import GridSearchCV
parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),
}

In [19]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))



In [20]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.6825
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [21]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.690625
{'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


In [22]:
df2 = pd.read_csv('forEvaluating.csv',index_col=0)
df2.tail()

Unnamed: 0,category,predicted_category,text
995,GENERAL,name,japan expanding state emergency entire nation ...
996,GENERAL,name,south african president extends lockdown two w...
997,GENERAL,name,uk report largest oneday death toll coronaviru...
998,POLITICS,name,hill campaign report biden seek counter trump ...
999,GENERAL,name,nigeria issue warning drug touted trump overdo...


In [23]:
y = pd.DataFrame(df2['category'])
X = pd.DataFrame(df2['text'])

In [33]:
text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_svm.predict((X['text'].values.astype(str)))
np.mean(predicted == y['category'].values.astype(str))

0.368

In [35]:
for i, row in df2.iterrows():
    df2.at[i,'category'] = predicted[i]

In [37]:
df2.tail()

Unnamed: 0,text,category
11666,hong kong highly resilient fight save economy ...,general
11667,unreasonable china economy shrink 10 first qua...,finance
11668,u company still betting chinese consumer despi...,tech
11669,coronavirus live update global case 856900 dea...,general
11670,u clinical trial hydroxychloroquine hit warp s...,science


In [111]:
df3 = pd.read_csv('Combined.csv')
df3['category'] = df3['Unnamed: 0']
df3 = df3.drop(columns=['Unnamed: 0'])
df3.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,category
0,The Times of India,Sarthak Dogra,400 White Hackers Are Quietly Saving World Fro...,A group of white hackers from across the world...,https://www.indiatimes.com/technology/news/400...,https://im.indiatimes.in/content/2020/Mar/hack...,2020-03-27T10:43:30Z,With the Coronavirus outbreak skyrockets depen...,name
1,The Times of India,KC Archana,Good News! Bengaluru Doctor Claims To Be Close...,Click here to read about Bengaluru based doct...,https://www.indiatimes.com/trending/human-inte...,https://im.indiatimes.in/content/2020/Mar/Beng...,2020-03-28T06:25:19Z,"COVID-19, the respiratory disease caused by th...",name
2,The Times of India,Vatsala Devki Vats,"Hrithik Roshan, Ajay Devgn Request Cured COVID...",Using the antibodies of patients who have succ...,https://www.indiatimes.com/entertainment/bolly...,https://im.indiatimes.in/content/2020/Apr/3-FB...,2020-04-20T05:00:19Z,Using the antibodies of patients who have succ...,name
3,The Times of India,ET CONTRIBUTORS,"Large caps should be initial beneficiaries, sa...",The spread of the Covid-19 virus and the meltd...,https://economictimes.indiatimes.com/mf/analys...,"https://img.etimg.com/thumb/msid-75059460,widt...",2020-04-09T05:14:48Z,"By Ravi GopalakrishnanWith the onset of 2020, ...",name
4,The Times of India,"Bhragu Haritas, ET CIO",Hackers begin exploiting COVID-19 situation at...,. As enterprises open up their networks to all...,https://cio.economictimes.indiatimes.com/news/...,https://etimg.etb2bimg.com/thumb/msid-74839021...,2020-03-27T03:12:00Z,The outbreak of COVID-19 has forced companies-...,name


In [44]:
for i, row in df2.iterrows():
    df3.at[i,'category'] = predicted[i]

In [45]:
df3.tail()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,category
11666,CNBC,Weizhen Tan,Hong Kong is 'highly resilient' as it fights t...,"Hong Kong is fighting a ""twin battle"" of the c...",https://www.cnbc.com/2020/04/14/coronavirus-ho...,https://image.cnbcfm.com/api/v1/image/10647610...,2020-04-14T06:39:13Z,"Hong Kong's popular nightlife district, Lan Kw...",general
11667,CNBC,Evelyn Cheng,'Not unreasonable' for China's economy to shri...,An early look at Chinese business conditions i...,https://www.cnbc.com/2020/03/24/china-beige-bo...,https://image.cnbcfm.com/api/v1/image/10642183...,2020-03-24T06:46:36Z,People wear masks as they cross a street durin...,finance
11668,CNBC,Evelyn Cheng,US companies are still betting on Chinese cons...,The American Chamber of Commerce in China's su...,https://www.cnbc.com/2020/03/26/us-companies-s...,https://image.cnbcfm.com/api/v1/image/10598809...,2020-03-25T23:28:43Z,An employee works next to shoes on display ins...,tech
11669,CNBC,Saheli Roy Choudhury,Coronavirus live updates: Global cases over 85...,"Most of the deaths occurred in Europe: 12,428 ...",https://www.cnbc.com/2020/04/01/coronavirus-li...,https://image.cnbcfm.com/api/v1/image/10646758...,2020-04-01T00:34:44Z,This is a live blog. Please check back for upd...,general
11670,CNBC,Berkeley Lovelace Jr.,US clinical trials of hydroxychloroquine hit '...,Researchers at NYU Langone on March 24 launche...,https://www.cnbc.com/2020/04/15/coronavirus-us...,https://image.cnbcfm.com/api/v1/image/10648079...,2020-04-15T12:50:57Z,A bottle of Prasco Laboratories Hydroxychloroq...,science


In [46]:
df3.to_csv("Combined With Category.csv")

In [141]:
df4 = pd.read_csv('forEvaluating.csv',index_col=0)
df4.tail()

Unnamed: 0,category,text,predicted_category
995,general,japan expanding state emergency entire nation ...,name
996,general,south african president extends lockdown two w...,name
997,general,uk report largest oneday death toll coronaviru...,name
998,general,hill campaign report biden seek counter trump ...,name
999,general,nigeria issue warning drug touted trump overdo...,name


In [142]:
y = pd.DataFrame(df4['category'])
X = pd.DataFrame(df4['text'])

In [143]:
text_clf_nn.fit(X_train['text'], y_train['topic_area'])
predicted = text_clf_nn.predict((X['text'].values.astype(str)))
np.mean(predicted == y['category'])

0.201

In [85]:
for i, row in df4.iterrows():
    df4.at[i,'predicted_category'] = predicted[i]

In [90]:
df4.tail()

Unnamed: 0,category,text,predicted_category
995,general,japan expanding state emergency entire nation ...,business
996,general,south african president extends lockdown two w...,business
997,general,uk report largest oneday death toll coronaviru...,general
998,general,hill campaign report biden seek counter trump ...,general
999,general,nigeria issue warning drug touted trump overdo...,tech


In [87]:
df4.to_csv("labelled_and_predicted.csv")

PermissionError: [Errno 13] Permission denied: 'labelled_and_predicted.csv'

In [128]:
df5 = pd.read_csv('Evaluation Dataset.csv')
df5['predicted_category'] = df5['category']
df5 = df5.drop(columns=['Unnamed: 0'])
df5.tail()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,category,predicted_category
995,The Hill,John Bowden,Japan expanding state of emergency to entire n...,Japanese Prime Minister Shinzo Abe officially ...,https://thehill.com/policy/international/49309...,https://thehill.com/sites/default/files/blogs/...,2020-04-16T13:02:27Z,Japanese Prime Minister Shinzo Abe officially ...,general,general
996,The Hill,Marty Johnson,South African president extends lockdown two w...,South African President Cyril Ramaphosa on Fri...,https://thehill.com/policy/international/afric...,https://thehill.com/sites/default/files/ramaph...,2020-04-10T14:29:54Z,South African President Cyril Ramaphosa on Fri...,general,general
997,The Hill,Zack Budryk,UK reports largest one-day death toll from cor...,The U.K. saw its largest single-day death toll...,https://thehill.com/blogs/blog-briefing-room/n...,https://thehill.com/sites/default/files/uk_cor...,2020-04-01T14:25:54Z,The U.K. saw its largest single-day death toll...,general,general
998,The Hill,"Julia Manchester,Max Greenwood and Jonathan Ea...",The Hill's Campaign Report: Biden seeks to cou...,"Welcome to The Hill's Campaign Report, your da...",https://thehill.com/homenews/campaign/489104-t...,https://thehill.com/sites/default/files/bidenj...,2020-03-23T21:27:11Z,"Welcome to The Hill's Campaign Report, your da...",general,general
999,The Hill,Zack Budryk,Nigeria issues warning for drug touted by Trum...,Nigerian health officials on Monday issued a w...,https://thehill.com/policy/healthcare/488972-n...,https://thehill.com/sites/default/files/nigeri...,2020-03-23T13:07:05Z,Nigerian health officials on Monday issued a w...,general,general


In [129]:
for i, row in df5.iterrows():
    df5.at[i,'predicted_category'] = predicted[i]

In [130]:
np.mean(df5['category'] == df5['predicted_category'])

0.29

In [131]:
df5.tail()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,category,predicted_category
995,The Hill,John Bowden,Japan expanding state of emergency to entire n...,Japanese Prime Minister Shinzo Abe officially ...,https://thehill.com/policy/international/49309...,https://thehill.com/sites/default/files/blogs/...,2020-04-16T13:02:27Z,Japanese Prime Minister Shinzo Abe officially ...,general,business
996,The Hill,Marty Johnson,South African president extends lockdown two w...,South African President Cyril Ramaphosa on Fri...,https://thehill.com/policy/international/afric...,https://thehill.com/sites/default/files/ramaph...,2020-04-10T14:29:54Z,South African President Cyril Ramaphosa on Fri...,general,business
997,The Hill,Zack Budryk,UK reports largest one-day death toll from cor...,The U.K. saw its largest single-day death toll...,https://thehill.com/blogs/blog-briefing-room/n...,https://thehill.com/sites/default/files/uk_cor...,2020-04-01T14:25:54Z,The U.K. saw its largest single-day death toll...,general,general
998,The Hill,"Julia Manchester,Max Greenwood and Jonathan Ea...",The Hill's Campaign Report: Biden seeks to cou...,"Welcome to The Hill's Campaign Report, your da...",https://thehill.com/homenews/campaign/489104-t...,https://thehill.com/sites/default/files/bidenj...,2020-03-23T21:27:11Z,"Welcome to The Hill's Campaign Report, your da...",general,general
999,The Hill,Zack Budryk,Nigeria issues warning for drug touted by Trum...,Nigerian health officials on Monday issued a w...,https://thehill.com/policy/healthcare/488972-n...,https://thehill.com/sites/default/files/nigeri...,2020-03-23T13:07:05Z,Nigerian health officials on Monday issued a w...,general,tech


In [107]:
df5.to_csv("labelled_and_predicted.csv")