In [4]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix


In [5]:
def clean_str(string):
    """
    Normalise a raw text string into lowercase, whitespace-separated tokens.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The substitutions below run strictly in the listed order, each one over
    the output of the previous one, so the order must not be changed.

    Args:
        string: the text to clean.

    Returns:
        The cleaned text, stripped of surrounding whitespace and lower-cased.
    """
    substitutions = (
        # Drop English contraction / possessive suffixes outright.
        (r"\'s", ""),
        (r"\'ve", ""),
        (r"n\'t", ""),
        (r"\'re", ""),
        (r"\'d", ""),
        (r"\'ll", ""),
        # Delete commas; pad exclamation marks so they become separate tokens.
        (r",", ""),
        (r"!", " ! "),
        # Delete parentheses, question marks and any leftover apostrophes.
        (r"\(", ""),
        (r"\)", ""),
        (r"\?", ""),
        (r"'", ""),
        # Every character outside the allowed set turns into a space.
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        # Remove a digit together with any word characters directly after it.
        (r"[0-9]\w+|[0-9]", ""),
        # Collapse runs of whitespace into a single space.
        (r"\s{2,}", " "),
    )

    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

In [8]:
# Read the dataset and pull the 'news' and 'category' columns out as plain lists.
data = pd.read_csv('./dataset.csv')
x = data['news'].tolist()
y = data['category'].tolist()

# Replace each entry of `x` with its cleaned, lemmatized form, logging the
# index of the entry being processed before the work starts.
for index in range(len(x)):
    print("processing data:", index)
    tokens = clean_str(x[index]).split()
    lemmas = [Word(token).lemmatize() for token in tokens]
    x[index] = ' '.join(lemmas)

processing data: 0
processing data: 1
processing data: 2
processing data: 3
processing data: 4
processing data: 5
processing data: 6
processing data: 7
processing data: 8
processing data: 9
processing data: 10
processing data: 11
processing data: 12
processing data: 13
processing data: 14
processing data: 15
processing data: 16
processing data: 17
processing data: 18
processing data: 19
processing data: 20
processing data: 21
processing data: 22
processing data: 23
processing data: 24
processing data: 25
processing data: 26
processing data: 27
processing data: 28
processing data: 29
processing data: 30
processing data: 31
processing data: 32
processing data: 33
processing data: 34
processing data: 35
processing data: 36
processing data: 37
processing data: 38
processing data: 39
processing data: 40
processing data: 41
processing data: 42
processing data: 43
processing data: 44
processing data: 45
processing data: 46
processing data: 47
processing data: 48
processing data: 49
processing

processing data: 430
processing data: 431
processing data: 432
processing data: 433
processing data: 434
processing data: 435
processing data: 436
processing data: 437
processing data: 438
processing data: 439
processing data: 440
processing data: 441
processing data: 442
processing data: 443
processing data: 444
processing data: 445
processing data: 446
processing data: 447
processing data: 448
processing data: 449
processing data: 450
processing data: 451
processing data: 452
processing data: 453
processing data: 454
processing data: 455
processing data: 456
processing data: 457
processing data: 458
processing data: 459
processing data: 460
processing data: 461
processing data: 462
processing data: 463
processing data: 464
processing data: 465
processing data: 466
processing data: 467
processing data: 468
processing data: 469
processing data: 470
processing data: 471
processing data: 472
processing data: 473
processing data: 474
processing data: 475
processing data: 476
processing da

In [9]:
# Preview only the first few processed documents; displaying the entire list
# floods the cell output and bloats the saved notebook.
x[:3]

['claxton hunting first major medal british hurdler sarah claxton is confident she can win her first major medal at next month european indoor championship in madrid the year old ha already smashed the british record over hurdle twice this season setting a new mark of second to win the aaa title i am quite confident said claxton but i take each race a it come a long a i keep up my training but not do too much i think there is a chance of a medal claxton ha won the national hurdle title for the past three year but ha struggled to translate her domestic success to the international stage now the scotland born athlete owns the equal fifth fastest time in the world this year and at last week birmingham grand prix claxton left european medal favourite russian irina shevchenko trailing in sixth spot for the first time claxton ha only been preparing for a campaign over the hurdle which could explain her leap in form in previous season the year old also contested the long jump but since moving

In [11]:
# Single seed shared by the train/test split and the random forest so that a
# fresh "Restart & Run All" reproduces the same metrics.
RANDOM_STATE = 42

# TF-IDF features over unigrams and bigrams, with English stop words removed,
# sublinear term-frequency scaling, L2-normalised rows, and terms that occur in
# fewer than 5 documents discarded.
vect = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# NOTE(review): the vectorizer is fitted on the full corpus before the split, so
# the vocabulary and IDF weights also see the held-out documents. Consider
# fitting on the training portion only if the scores must be leakage-free.
X = vect.fit_transform(x)
Y = np.array(y)

print("no of features extracted:",X.shape[1])

# Hold out 20% of the documents for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=RANDOM_STATE)

print("train size:", X_train.shape)
print("test size:", X_test.shape)

# The forest is seeded explicitly: without random_state its bootstrap samples
# and feature sub-sampling differ on every run, and so do the reported scores.
model = RandomForestClassifier(n_estimators=300, max_depth=150, n_jobs=1, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
c_mat = confusion_matrix(y_test,y_pred)
kappa = cohen_kappa_score(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
print("Confusion Matrix:\n", c_mat)
print("\nKappa: ",kappa)
print("\nAccuracy: ",acc)

no of features extracted: 5179
train size: (589, 5179)
test size: (148, 5179)
Confusion Matrix:
 [[27  0  0  0  0]
 [ 0 20  1  0  0]
 [ 0  0 61  0  0]
 [ 0  0  0 21  0]
 [ 0  0  0  0 18]]

Kappa:  0.9908687068114511

Accuracy:  0.9932432432432432
