In [1]:
import pandas as pd
import scipy as sp
import nltk
from nltk.corpus import reuters
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [2]:
import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [3]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV

# 1. Dataset Preparation

In [5]:
#Load data from nltk corpus
documents = reuters.fileids()
print(str(len(documents)) + " documents")
train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
print(str(len(train_docs))+ ' total train docs')
test_docs = list(filter(lambda doc: doc.startswith("test"),documents))
print(str(len(test_docs)) + ' total test docs')
categories = reuters.categories()
print(str(len(categories)) + " total categories")
train_y = [reuters.categories(i)[0] for i in train_docs]
test_y = [reuters.categories(i)[0] for i in test_docs]

10788 documents
7769 total train docs
3019 total test docs
90 total categories


In [11]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y_coded = encoder.fit_transform(train_y)
test_y_coded = encoder.fit_transform(test_y)

In [16]:
trainDF = pd.DataFrame()

In [18]:
#To clean '\n' contained in the raw data 
train_n_paragraph, train_texts = [],[]

for i in train_docs:
    train_texts.append(''.join(reuters.raw(i).split('\n')))
    train_n_paragraph.append(reuters.raw(i).count('\n'))
    
trainDF['texts']=train_texts
trainDF['n_paragraph']=train_n_paragraph #count how many paragraphs in one paper by counting the number of '\n', and keep it as a feature

In [19]:
test_n_paragraph, test_texts = [],[]

for i in test_docs:
    test_texts.append(''.join(reuters.raw(i).split('\n')))
    test_n_paragraph.append(reuters.raw(i).count('\n'))
    testDF = pd.DataFrame()
testDF['texts']=test_texts
testDF['n_paragraph']=test_n_paragraph

In [21]:
dataset=pd.concat([trainDF,testDF])#put training data and testing data together can make data processing easier.
dataset.reset_index(inplace=True)

In [22]:
cachedStopWords = stopwords.words("english")

### Remove stop-words

In [23]:
dataset['texts_r_sw']=dataset['texts'].apply(lambda x: [ word  for word in x.split() if word not in cachedStopWords])

In [24]:
dataset['sw_percentage']=dataset['texts_r_sw'].apply(len)/dataset['texts'].apply(lambda x: len(x.split()))

### Word Stemming

In [28]:
stemming=PorterStemmer()
dataset['texts_sw_st']=dataset['texts_r_sw'].apply(lambda x: [stemming.stem(i) for i in x])

### Word tokenization

In [29]:
dataset['texts_tk']=dataset['texts_sw_st'].apply(lambda x: word_tokenize(' '.join(x))) 

In [31]:
dataset['texts_tk']=dataset['texts_tk'].apply(' '.join)

# 2. Feature Engineering

### Count Vector

In [32]:
count_vect_tk = CountVectorizer(analyzer='word', token_pattern = r'\w{1,}')
count_vect_tk.fit(dataset['texts_tk'])
xtrain_count_tk = count_vect_tk.transform(dataset['texts_tk'][:7769])
xtest_count_tk = count_vect_tk.transform(dataset['texts_tk'][7769:])

In [59]:
xtrain_count_tk.shape

(7769, 28633)

### Word Level TF-IDF

In [121]:
# Word level TF-IDF
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(dataset['texts'])
xtrain_tfidf = tfidf_vect.transform(dataset['texts'][:7769])
xtest_tfidf = tfidf_vect.transform(dataset['texts'][7769:])

In [122]:
xtrain_tfidf.shape

(7769, 5000)

### Text based features

In [38]:

dataset['char_count']=dataset['texts'].apply(len)

In [39]:

dataset['word_count']=dataset['texts'].apply(lambda x: len(x.split()))

In [40]:

dataset['word_density']=dataset['char_count']/(dataset['word_count']+1)

In [41]:

dataset['punctuation_count']=dataset['texts'].apply(lambda x: len(''.join(_ for _ in x if _ in string.punctuation)))

In [42]:
#Noun Count; Verb Count; Abjective Count; Pronoun Count; Adverb Count;
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}
# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

dataset['noun_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'noun'))
dataset['verb_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'verb'))
dataset['adj_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'adj'))
dataset['adv_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'adv'))
dataset['pron_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'pron'))

In [78]:
tn_features=['noun_count','verb_count','adj_count','adv_count','pron_count','char_count','word_count','punctuation_count','word_density','n_paragraph','sw_percentage']

In [123]:
xtrain_cb=sp.hstack((xtrain_count_tk.toarray(),xtrain_tfidf.toarray()))
xtest_cb=sp.hstack((xtest_count_tk.toarray(),xtest_tfidf.toarray()))

In [94]:
xtrain_df=pd.DataFrame(xtrain_cb)
xtrain_df[tn_features]=dataset[tn_features][:7769]

In [95]:
xtest_df=pd.DataFrame(xtest_cb)
xtest_df[tn_features]=dataset[tn_features][7769:]

In [96]:
xtrain=sp.sparse.csr_matrix(xtrain_df.values)

In [97]:
xtrain.shape

(7769, 33644)

In [98]:
xtest=sp.sparse.csr_matrix(xtest_df.values)

In [99]:
xtest.shape

(3019, 33644)

# 3. Build the model

In [124]:
classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_cb,train_y)
predict_y=classifier.predict(xtest_cb)
print(metrics.accuracy_score(predict_y, test_y))



0.8834051010268301


In [107]:
classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count_tk,train_y)
predict_y=classifier.predict(xtest_count_tk)
print(metrics.accuracy_score(predict_y, test_y))



0.8830738655183835


In [117]:
classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_tfidf,train_y)
predict_y=classifier.predict(xtest_tfidf)
print(metrics.accuracy_score(predict_y, test_y))

0.8224577674726731


In [105]:
classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_df,train_y)
predict_y=classifier.predict(xtest_df)
print(metrics.accuracy_score(predict_y, test_y))



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').