# Text Classification
After experimentation, in this notebook we perform text classification on an unlabeled test dataset
containing almost 50K documents.
The method consists of lemmatization guided by part-of-speech (POS) tags, followed by a bagging ensemble of SVM classifiers.
We wrap the SVM in a bagging classifier so that training can run on multiple cores, with a view to reducing the execution time
of a plain SVM.

### In the end it achieves accuracy up to 0.9652

In [1]:
import pandas as pd
import time
import pickle

# Locations of the input data, the output predictions and the cached feature matrices.
# All paths are relative.
train_path = "files/data/train.csv"
# NOTE(review): the identifier is misspelled ("predicitions"); it is kept as-is because
# another cell refers to this exact name.
predicitions_path = 'files/data/predictions.csv'
test_path = "files/data/test_without_labels.csv"
# Pickle files holding the vectorized training and test sets.
x_vectors_path = "files/serialized/vectors"
# NOTE(review): "tets_vectors" looks like a typo for "test_vectors". It is harmless as long as
# the same variable is used for both storing and loading, which is the case in this notebook.
test_vectors_path = "files/serialized/tets_vectors"

## Pickle Store and load
Helpers used to store the produced vectors on disk and load them back.

In [None]:
def pickle_store(obj, filename):
    """Serialize `obj` with pickle into `filename`, replacing any existing file."""
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink)

def pickle_load(filename):
    """Return the object unpickled from `filename`.

    Unpickling can execute arbitrary code, so only use this on files that come from a
    trusted source.
    """
    with open(filename, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded

## Evaluation using 5-Fold Cross Validation

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

def evaluation(clf, clf_name, X, y, k=5, average='micro'):
    """Run stratified k-fold cross validation for `clf` and print the mean scores.

    For every fold the classifier is fitted on the training split and scored on the
    held-out split. Precision, recall, F1 and accuracy are summed over the folds and
    divided by `k` before being printed together with the total wall-clock time.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    clf_name : str
        Label printed as the first line of the report.
    X : indexable
        Feature container; must support selection with an integer index array
        (``X[train_index]``).
    y : indexable
        Label container; must support selection with an integer index array
        (``y[train_index]``).
    k : int, default 5
        Number of folds.
    average : str, default 'micro'
        Averaging mode forwarded to ``precision_score``, ``recall_score`` and
        ``f1_score``. With single-label multi-class targets, 'micro' makes all three
        equal to the accuracy, so pass 'macro' or 'weighted' to get per-class
        information out of those rows of the report.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    starting_tm = time.time()
    clf_precision = 0
    clf_recall = 0
    clf_f1 = 0
    clf_accuracy = 0

    skf = StratifiedKFold(n_splits=k)
    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        clf_precision += metrics.precision_score(y_test, predictions, average=average)
        clf_recall += metrics.recall_score(y_test, predictions, average=average)
        clf_f1 += metrics.f1_score(y_test, predictions, average=average)
        clf_accuracy += metrics.accuracy_score(y_test, predictions)

    # Mean of each metric over the k folds.
    mean_precision = clf_precision / k
    mean_recall = clf_recall / k
    mean_f1 = clf_f1 / k
    mean_accuracy = clf_accuracy / k

    print(clf_name + "\nPrecision: " + str(mean_precision)
          + "\nRecall: " + str(mean_recall)
          + "\nF1-Measure: " + str(mean_f1)
          + "\nAccuracy: " + str(mean_accuracy)
          + "\nExecution time: " + str(time.time() - starting_tm))

### Loading Data from Local store

In [3]:
# Read the training CSV and preview its first rows.
train = pd.read_csv(train_path)
train.head()

Unnamed: 0,Id,Title,Content,Label
0,227464,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment
1,244074,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment
2,60707,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology
3,27883,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology
4,169596,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business


In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Each document is its title repeated 5 times followed by the body (presumably so that
# title words weigh more in the term counts). Missing titles or bodies are replaced with
# an empty string first: otherwise the concatenation yields NaN for that row, and a NaN
# document cannot be tokenized by the preprocessing step.
X = (train['Title'].fillna("") + " ")*5 + train['Content'].fillna("")
# Integer-encode the class names; `le` is kept to map predictions back to names later.
y = le.fit_transform(train['Label'])

## Pre-process using Lemmatization

We apply lemmatization guided by part-of-speech (POS) tags. Using POS tags lets us lemmatize
not only nouns but also the other parts of speech. We also remove stopwords, punctuation and non-alphabetic characters.

In [6]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from gensim.utils import simple_preprocess

# One shared WordNet lemmatizer instance, used for both the stop-word list and the documents.
lmtzr = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to the WordNet adjective, verb, noun and
    adverb constants respectively; any other tag yields None.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, wn_attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Resolved lazily so `wordnet` is only touched when a prefix matches.
            return getattr(wordnet, wn_attr)
    return None

# Stop words: scikit-learn's list, NLTK's English list, and a few corpus-specific
# high-frequency words.
my_stopwords = ENGLISH_STOP_WORDS.union(stopwords.words('english'))\
    .union(['include', 'way', 'work', 'look', 'add', 'time', 'year', 'month', 'day',
            'help', 'think', 'tell', 'new', 'said', 'say', 'need', 'come', 'good',
            'set', 'want', 'people', 'use', 'week', 'know'])

# Lemmatized counterpart of the stop-word list, compared against lemmas during preprocessing.
# The words are sorted before tagging: a set has no stable iteration order across
# interpreter sessions, and the POS tagger looks at neighbouring tokens, so tagging the
# raw set could produce a different lemma set on every kernel restart.
my_stopwords_lemma = set()
for word, nltk_tag in nltk.pos_tag(sorted(my_stopwords)):
    tag = nltk2wn_tag(nltk_tag)
    if tag is not None:
        my_stopwords_lemma.add(lmtzr.lemmatize(word, tag))
    else:
        # No WordNet POS for this tag: keep the word unchanged.
        my_stopwords_lemma.add(word)
        

def documents_preprocess(documents):
    """Tokenize, POS-tag, lemmatize and stop-word-filter every document.

    Each document is tokenized with gensim's `simple_preprocess` (accents removed) and
    POS-tagged with NLTK. Tokens whose tag maps to a WordNet POS are lemmatized and kept
    unless the lemma is in `my_stopwords_lemma`; the remaining tokens are kept as-is
    unless they are in `my_stopwords`.

    Prints the elapsed time and returns a list with one token list per input document.
    """
    started_at = time.time()
    processed = []
    for document in documents:
        tagged_tokens = nltk.pos_tag(simple_preprocess(document, deacc=True))
        kept = []
        for token, pos in tagged_tokens:
            wn_tag = nltk2wn_tag(pos)
            if wn_tag is None:
                # Untaggable for WordNet: filter the surface form against the raw list.
                candidate, blocklist = token, my_stopwords
            else:
                candidate, blocklist = lmtzr.lemmatize(token, wn_tag), my_stopwords_lemma
            if candidate not in blocklist:
                kept.append(candidate)
        processed.append(kept)

    print("Text Preprocessing took: " + str(time.time() - started_at))
    return processed

In [12]:
# NOTE: X is overwritten with its tokenized form (a list of token lists). Re-running this
# cell on its own would feed token lists, not raw strings, back into documents_preprocess;
# re-run the cell that builds X from the raw text first.
X = documents_preprocess(X)

Text Preprocessing took: 1380.5122604370117


## Text pre-process and Vectorization
We use the hashing trick and then a TF-IDF transformer to convert word frequencies into TF-IDF values.

In [13]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 

# The documents are already lists of tokens, so the tokenizer is the identity function and
# lowercasing is switched off. Tokens are hashed into 100000 feature columns.
vectorizer = HashingVectorizer(n_features=100000, lowercase=False, tokenizer=lambda x: x)
tfidf_transformer = TfidfTransformer()

starting_tm = time.time()
# Hash the token lists into a feature matrix, then re-weight it with TF-IDF.
# tfidf_transformer is fitted here and reused, already fitted, on the test set later on.
vtrain = vectorizer.fit_transform(X)
vtrain = tfidf_transformer.fit_transform(vtrain)
print("Vectorization time: " + str((time.time() - starting_tm)))

# Cache the training matrix on disk so it can be reloaded without repeating the steps above.
pickle_store(vtrain, x_vectors_path)



Vectorization time: 5.9197611808776855


In [10]:
# Reload the cached training matrix from disk; this replaces any in-memory vtrain.
vtrain = pickle_load(x_vectors_path)

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn import svm

# Number of SVMs in the ensemble, also used as the number of parallel jobs.
n = 8
svm_clf = svm.SVC(gamma=0.1, C=10, kernel='rbf')
# Every base SVM is fitted on a random sample holding 1/n of the training rows.
# random_state pins that sampling so the scores printed below can be reproduced.
bagging_clf = BaggingClassifier(svm_clf, n_estimators=n, max_samples=1/n, n_jobs=n,
                                random_state=42)

evaluation(bagging_clf, "Baggings SVM",  vtrain, y)

Baggings SVM
Precision: 0.9648642604767655
Recall: 0.9648642604767655
F1-Measure: 0.9648642604767655
Accuracy: 0.9648642604767655
Execution time: 1137.402369260788


---
## Training and Predicting
First load the testing set, clean it and vectorize it, and then fit the classifier to the 
training set and predict the testing set.

In [14]:
test = pd.read_csv(test_path)
# Same document layout as the training set: title repeated 5 times, then the body.
# Missing titles or bodies are replaced with an empty string first; otherwise the
# concatenation yields NaN for that row and the preprocessing step cannot tokenize it.
X_test = (test['Title'].fillna("") + " ")*5 + test['Content'].fillna("")

X_test = documents_preprocess(X_test)
starting_tm = time.time()
# transform only: reuse the vectorizer and the TF-IDF weights fitted on the training set.
vtest = vectorizer.transform(X_test)
vtest = tfidf_transformer.transform(vtest)
print("Vectorization time: " + str((time.time() - starting_tm)))

# Cache the test matrix on disk so it can be reloaded without repeating the preprocessing.
pickle_store(vtest, test_vectors_path)

Text Preprocessing took: 590.4956665039062
Vectorization time: 2.601217031478882


In [16]:
# Reload the cached test matrix from disk; this replaces any in-memory vtest.
vtest = pickle_load(test_vectors_path)

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn import svm

# NOTE(review): the cross-validation cell uses n = 8 while the final model uses n = 5, so
# the configuration scored there is not exactly the one trained here. Confirm this is intended.
n = 5
svm_clf = svm.SVC(gamma=0.1, C=10, kernel='rbf')
# Every base SVM is fitted on a random sample holding 1/n of the training rows.
# random_state pins that sampling so the submitted predictions can be regenerated.
bagging_clf = BaggingClassifier(svm_clf, n_estimators=n, max_samples=1/n, n_jobs=n,
                                random_state=42)

bagging_clf.fit(vtrain, y)
predictions = bagging_clf.predict(vtest)

# Map the integer class ids back to the original label names.
predictions = le.inverse_transform(predictions)

## Saving predictions as CSV

In [None]:
# Re-read the test file to get its Id column, pair each Id with its predicted label,
# and write the result without the DataFrame index.
test = pd.read_csv(test_path)
predictions_df = pd.DataFrame({'Id': test['Id'].tolist(), 'Predicted': predictions})
predictions_df.to_csv(predicitions_path, index=False)