In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('forModelTraining.csv',index_col=0)
df.tail()

Unnamed: 0,category,text
5995,ENVIRONMENT,kemosabe chatty porcupine eats banana video ke...
5996,ENVIRONMENT,bear camera capture life eye alaskan animal vi...
5997,ENVIRONMENT,first day fall autumnal equinox arrives septem...
5998,ENVIRONMENT,climate change could turn greenland well green...
5999,ENVIRONMENT,diy tshirt project idea reusing old tee networ...


In [3]:
y = pd.DataFrame(df['category'])
X = pd.DataFrame(df['text'])

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [6]:
import numpy as np
text_clf.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.6966666666666667

In [7]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.78      0.59      0.67       216
 ENVIRONMENT       0.70      0.85      0.77       197
     GENERAL       0.57      0.74      0.64       197
    POLITICS       0.69      0.76      0.72       176
     SCIENCE       0.84      0.48      0.61       224
        TECH       0.70      0.82      0.75       190

    accuracy                           0.70      1200
   macro avg       0.71      0.71      0.69      1200
weighted avg       0.72      0.70      0.69      1200



In [8]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, random_state=42,
max_iter=5, tol=None)),
])

text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_svm.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.695

In [9]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.80      0.57      0.67       216
 ENVIRONMENT       0.72      0.81      0.77       197
     GENERAL       0.78      0.62      0.69       197
    POLITICS       0.49      0.81      0.61       176
     SCIENCE       0.79      0.58      0.67       224
        TECH       0.73      0.82      0.77       190

    accuracy                           0.69      1200
   macro avg       0.72      0.70      0.70      1200
weighted avg       0.73      0.69      0.70      1200



In [10]:
from sklearn.neural_network import MLPClassifier

text_clf_nn = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf-nn', MLPClassifier(solver='lbfgs', random_state=0)),
])


In [11]:
text_clf_nn.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_nn.predict(X_test['text'].values.astype(str))
np.mean(predicted == y_test['category'].values.astype(str))

0.6816666666666666

In [12]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

    BUSINESS       0.69      0.63      0.66       216
 ENVIRONMENT       0.71      0.76      0.73       197
     GENERAL       0.54      0.73      0.62       197
    POLITICS       0.74      0.70      0.72       176
     SCIENCE       0.72      0.60      0.65       224
        TECH       0.75      0.69      0.72       190

    accuracy                           0.68      1200
   macro avg       0.69      0.68      0.68      1200
weighted avg       0.69      0.68      0.68      1200



In [13]:
from sklearn.model_selection import GridSearchCV
parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),
}

In [14]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))



In [15]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.671875
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [16]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.6847916666666667
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [17]:
df2 = pd.read_csv('forEvaluating.csv',index_col=0)
df2.tail()

Unnamed: 0,category,predicted_category,text
995,GENERAL,name,japan expanding state emergency entire nation ...
996,GENERAL,name,south african president extends lockdown two w...
997,GENERAL,name,uk report largest oneday death toll coronaviru...
998,POLITICS,name,hill campaign report biden seek counter trump ...
999,GENERAL,name,nigeria issue warning drug touted trump overdo...


In [18]:
y = pd.DataFrame(df2['category'])
X = pd.DataFrame(df2['text'])

In [19]:
text_clf_svm.fit(X_train['text'].values.astype(str), y_train['category'].values.astype(str))
predicted = text_clf_svm.predict((X['text'].values.astype(str)))
np.mean(predicted == y['category'].values.astype(str))

0.373

In [20]:
df3 = pd.read_csv('labeled.csv',index_col=0)
df3.tail()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,category,predicted_category
995,The Hill,John Bowden,Japan expanding state of emergency to entire n...,Japanese Prime Minister Shinzo Abe officially ...,https://thehill.com/policy/international/49309...,https://thehill.com/sites/default/files/blogs/...,2020-04-16T13:02:27Z,Japanese Prime Minister Shinzo Abe officially ...,GENERAL,name
996,The Hill,Marty Johnson,South African president extends lockdown two w...,South African President Cyril Ramaphosa on Fri...,https://thehill.com/policy/international/afric...,https://thehill.com/sites/default/files/ramaph...,2020-04-10T14:29:54Z,South African President Cyril Ramaphosa on Fri...,GENERAL,name
997,The Hill,Zack Budryk,UK reports largest one-day death toll from cor...,The U.K. saw its largest single-day death toll...,https://thehill.com/blogs/blog-briefing-room/n...,https://thehill.com/sites/default/files/uk_cor...,2020-04-01T14:25:54Z,The U.K. saw its largest single-day death toll...,GENERAL,name
998,The Hill,"Julia Manchester,Max Greenwood and Jonathan Ea...",The Hill's Campaign Report: Biden seeks to cou...,"Welcome to The Hill's Campaign Report, your da...",https://thehill.com/homenews/campaign/489104-t...,https://thehill.com/sites/default/files/bidenj...,2020-03-23T21:27:11Z,"Welcome to The Hill's Campaign Report, your da...",POLITICS,name
999,The Hill,Zack Budryk,Nigeria issues warning for drug touted by Trum...,Nigerian health officials on Monday issued a w...,https://thehill.com/policy/healthcare/488972-n...,https://thehill.com/sites/default/files/nigeri...,2020-03-23T13:07:05Z,Nigerian health officials on Monday issued a w...,GENERAL,name


In [21]:
for i, row in df3.iterrows():
    df3.at[i,'predicted_category'] = predicted[i]

In [22]:
df3.tail()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,category,predicted_category
995,The Hill,John Bowden,Japan expanding state of emergency to entire n...,Japanese Prime Minister Shinzo Abe officially ...,https://thehill.com/policy/international/49309...,https://thehill.com/sites/default/files/blogs/...,2020-04-16T13:02:27Z,Japanese Prime Minister Shinzo Abe officially ...,GENERAL,GENERAL
996,The Hill,Marty Johnson,South African president extends lockdown two w...,South African President Cyril Ramaphosa on Fri...,https://thehill.com/policy/international/afric...,https://thehill.com/sites/default/files/ramaph...,2020-04-10T14:29:54Z,South African President Cyril Ramaphosa on Fri...,GENERAL,GENERAL
997,The Hill,Zack Budryk,UK reports largest one-day death toll from cor...,The U.K. saw its largest single-day death toll...,https://thehill.com/blogs/blog-briefing-room/n...,https://thehill.com/sites/default/files/uk_cor...,2020-04-01T14:25:54Z,The U.K. saw its largest single-day death toll...,GENERAL,ENVIRONMENT
998,The Hill,"Julia Manchester,Max Greenwood and Jonathan Ea...",The Hill's Campaign Report: Biden seeks to cou...,"Welcome to The Hill's Campaign Report, your da...",https://thehill.com/homenews/campaign/489104-t...,https://thehill.com/sites/default/files/bidenj...,2020-03-23T21:27:11Z,"Welcome to The Hill's Campaign Report, your da...",POLITICS,POLITICS
999,The Hill,Zack Budryk,Nigeria issues warning for drug touted by Trum...,Nigerian health officials on Monday issued a w...,https://thehill.com/policy/healthcare/488972-n...,https://thehill.com/sites/default/files/nigeri...,2020-03-23T13:07:05Z,Nigerian health officials on Monday issued a w...,GENERAL,POLITICS


In [None]:
for i, row in df2.iterrows():
    df2.at[i,'category'] = predicted[i]

In [None]:
df2.tail()

In [None]:
df3 = pd.read_csv('Combined.csv')
df3['category'] = df3['Unnamed: 0']
df3 = df3.drop(columns=['Unnamed: 0'])
df3.head()

In [None]:
for i, row in df2.iterrows():
    df3.at[i,'category'] = predicted[i]

In [None]:
df3.tail()

In [None]:
df3.to_csv("Combined With Category.csv")

In [None]:
df4 = pd.read_csv('forEvaluating.csv',index_col=0)
df4.tail()

In [None]:
y = pd.DataFrame(df4['category'])
X = pd.DataFrame(df4['text'])

In [None]:
text_clf_nn.fit(X_train['text'], y_train['topic_area'])
predicted = text_clf_nn.predict((X['text'].values.astype(str)))
np.mean(predicted == y['category'])

In [None]:
for i, row in df4.iterrows():
    df4.at[i,'predicted_category'] = predicted[i]

In [None]:
df4.tail()

In [None]:
df4.to_csv("labelled_and_predicted.csv")

In [None]:
df5 = pd.read_csv('Evaluation Dataset.csv')
df5['predicted_category'] = df5['category']
df5 = df5.drop(columns=['Unnamed: 0'])
df5.tail()

In [None]:
for i, row in df5.iterrows():
    df5.at[i,'predicted_category'] = predicted[i]

In [None]:
np.mean(df5['category'] == df5['predicted_category'])

In [None]:
df5.tail()

In [None]:
df5.to_csv("labelled_and_predicted.csv")