In [111]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import re
import string

# Pre-processing

In [112]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    0. Remove all links and referneces (@Name ...), digits
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words
    4. Convert words ot its infinitve form
    '''
    text = re.sub(r"@\S+", "", text)
    text = re.sub(r"http:\S+", "", text)
    text = re.sub(r"\d\S+", "", text)
    nopunc = [char for char in text if char not in string.punctuation]
    result = ''.join(nopunc)
    
    # result = [word for word in result.split() if word.lower() not in stopwords.words('english')]
    result = word_tokenize(result)

    stemmer = PorterStemmer()
    result = [ stemmer.stem(word) for word in result ]
    result = ' '.join(result)
    return result

In [124]:
def load_data(filename):
    data = pd.read_csv(filename)
    data.columns = ['label', 'text']

    X = data['text'].apply(text_process)
    y = data['label']
    
    return X,y

In [125]:
X_train,y_train = load_data('./training-data/tweets.csv')

X_test,y_test = load_data('./test-data/labelled.csv')

# Training

## Naive Bayes

In [126]:
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
             }

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.6498993963782697
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


## SVM

In [127]:
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(
                             loss='hinge',
                             penalty='l2',
                             alpha=1e-3,
                             max_iter=5,
                             random_state=42)),
                        ])

parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'tfidf__use_idf': (True, False),
              'clf-svm__alpha': (1e-2, 1e-3),
             }

gs_clf_scm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_scm.fit(X_train, y_train)

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

0.670020120724346
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 3)}


# Testing

## Naive Bayes

In [128]:
preds = gs_clf.predict(X_test)

In [129]:
np.mean(preds == y_test)

0.5454545454545454

In [130]:
review = text_process("the countryside was beautiful but the surroundings were culture list")
review_transformed = bow_transformer.transform([review])
print(review_transformed)
print(nb.predict(review_transformed)[0])

  (0, 442)	1
  (0, 606)	1
  (0, 617)	1
  (0, 2350)	1
  (0, 3840)	2
  (0, 4347)	1
  (0, 4475)	1
4


## SVM

In [131]:
preds_svm = gs_clf_svm.predict(X_test)

In [132]:
np.mean(preds_svm == y_test)

0.6363636363636364