In [28]:
import pandas as pd
data = pd.read_csv('bbc-news-data.csv', sep="\t")
data

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


In [29]:
def empty_check(df, col):
    empty = df[df[col].str.strip() == ''].shape[0]
    print("Empty " + col + ": ", empty)
    if(empty > 0):
        df = df[df[col].str.strip() != '']
    return df

data = empty_check(data, 'title')
data = empty_check(data, 'content')

Empty title:  0
Empty content:  0


In [30]:
import nltk 
import numpy as np
import re
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A) # remove .,;:() etc
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
norm_data = normalize_corpus(data.title)
data.title = norm_data

norm_data = normalize_corpus(data.content)
data.content = norm_data
data

Unnamed: 0,category,filename,title,content
0,business,001.txt,ad sales boost time warner profit,quarterly profits us media giant timewarner ju...
1,business,002.txt,dollar gains greenspan speech,dollar hit highest level euro almost three mon...
2,business,003.txt,yukos unit buyer faces loan claim,owners embattled russian oil giant yukos ask b...
3,business,004.txt,high fuel prices hit bas profits,british airways blamed high fuel prices drop p...
4,business,005.txt,pernod takeover talk lifts domecq,shares uk drinks food firm allied domecq risen...
...,...,...,...,...
2220,tech,397.txt,bt program beat dialler scams,bt introducing two initiatives help beat rogue...
2221,tech,398.txt,spam emails tempt net shoppers,computer users across world continue ignore se...
2222,tech,399.txt,careful code,new european directive could put software writ...
2223,tech,400.txt,us cyber security chief resigns,man making sure us computer networks safe secu...


In [31]:
data = empty_check(data, 'title')
data = empty_check(data, 'content')

Empty title:  0
Empty content:  0


In [32]:
data['text'] = data['title'] + ' ' + data['content']
data

Unnamed: 0,category,filename,title,content,text
0,business,001.txt,ad sales boost time warner profit,quarterly profits us media giant timewarner ju...,ad sales boost time warner profit quarterly pr...
1,business,002.txt,dollar gains greenspan speech,dollar hit highest level euro almost three mon...,dollar gains greenspan speech dollar hit highe...
2,business,003.txt,yukos unit buyer faces loan claim,owners embattled russian oil giant yukos ask b...,yukos unit buyer faces loan claim owners embat...
3,business,004.txt,high fuel prices hit bas profits,british airways blamed high fuel prices drop p...,high fuel prices hit bas profits british airwa...
4,business,005.txt,pernod takeover talk lifts domecq,shares uk drinks food firm allied domecq risen...,pernod takeover talk lifts domecq shares uk dr...
...,...,...,...,...,...
2220,tech,397.txt,bt program beat dialler scams,bt introducing two initiatives help beat rogue...,bt program beat dialler scams bt introducing t...
2221,tech,398.txt,spam emails tempt net shoppers,computer users across world continue ignore se...,spam emails tempt net shoppers computer users ...
2222,tech,399.txt,careful code,new european directive could put software writ...,careful code new european directive could put ...
2223,tech,400.txt,us cyber security chief resigns,man making sure us computer networks safe secu...,us cyber security chief resigns man making sur...


In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.text, data.category, test_size=0.2, random_state=42)

In [34]:
from gensim.models import Word2Vec

wpt = nltk.WordPunctTokenizer()
tokenized_data = [wpt.tokenize(document) for document in X_train]
model = Word2Vec(tokenized_data, window=5, min_count=1, workers=4)


In [35]:
vector_data = set(model.wv.index_to_key)

X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in vector_data])
                         for ls in X_train])
X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in vector_data])
                         for ls in X_test])

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))


  X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in vector_data])
  X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in vector_data])


In [36]:
#random forrest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vect_avg, y_train.values.ravel())
y_pred = rf.predict(X_test_vect_avg)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.49887640449438203


In [37]:
test_text = 'football player Reandy goaled in the match against the other team'
test_text = normalize_document(test_text)
test_text = [test_text]
test_text_vect = np.array([np.array([model.wv[i] for i in ls if i in vector_data])
                            for ls in test_text])
test_text_vect_avg = []
for v in test_text_vect:
    if v.size:
        test_text_vect_avg.append(v.mean(axis=0))
    else:
        test_text_vect_avg.append(np.zeros(100, dtype=float))

rf.predict(test_text_vect_avg)

array(['sport'], dtype=object)

In [38]:
# gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(X_train_vect_avg, y_train.values.ravel())
y_pred = gb.predict(X_test_vect_avg)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.49887640449438203


In [39]:
test_text = 'football player Reandy goaled'
test_text = normalize_document(test_text)
test_text = [test_text]
test_text_vect = np.array([np.array([model.wv[i] for i in ls if i in vector_data])
                            for ls in test_text])
test_text_vect_avg = []
for v in test_text_vect:
    if v.size:
        test_text_vect_avg.append(v.mean(axis=0))
    else:
        test_text_vect_avg.append(np.zeros(100, dtype=float))

gb.predict(test_text_vect_avg)

array(['politics'], dtype=object)

In [40]:
# improve the rf model with grid search cross validation
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [400],
    'max_features': ['auto'],
    'max_depth' : [12],
    'criterion' :['entropy']
}

# param_grid = {
#     'n_estimators': range(50, 500, 50),
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [1,2,4,8,12,16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60],
#     'criterion' :['gini', 'entropy', 'mse', 'mae', 'friedman_mse', 'poisson', 'deviance']
# }
CV_rfc = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 5, verbose=2)
CV_rfc.fit(X_train_vect_avg, y_train.values.ravel())
#print(CV_rfc.best_params_)
y_pred = CV_rfc.predict(X_test_vect_avg)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END criterion=entropy, max_depth=12, max_features=auto, n_estimators=400; total time=  29.4s
[CV] END criterion=entropy, max_depth=12, max_features=auto, n_estimators=400; total time=  37.8s
[CV] END criterion=entropy, max_depth=12, max_features=auto, n_estimators=400; total time=  37.6s
[CV] END criterion=entropy, max_depth=12, max_features=auto, n_estimators=400; total time=  31.8s
[CV] END criterion=entropy, max_depth=12, max_features=auto, n_estimators=400; total time=  36.6s
Accuracy:  0.5146067415730337
