In [81]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import re
import string

# Pre-processing

In [82]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    0. Remove all links and referneces (@Name ...), digits
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words
    4. Convert words ot its infinitve form
    '''
    text = re.sub(r"@\S+", "", text)
    text = re.sub(r"http:\S+", "", text)
    text = re.sub(r"\d\S+", "", text)
    nopunc = [char for char in text if char not in string.punctuation]
    result = ''.join(nopunc)
    
    # result = [word for word in result.split() if word.lower() not in stopwords.words('english')]
    result = word_tokenize(result)

    stemmer = PorterStemmer()
    result = [ stemmer.stem(word) for word in result ]
    result = ' '.join(result)
    return result

In [83]:
tweets = pd.read_csv('./training-data/tweets.csv')
tweets.columns = ['label', 'text']

tweets['text_length'] = tweets['text'].apply(len)
the_longest_tweets = tweets[tweets['text_length'] == tweets['text_length'].max()]

X_orig = tweets['text']
y = tweets['label']

In [84]:
X_processed = [text_process(text) for text in X_orig]

In [85]:
# Split data to test and training datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=101)

# Training

## Naive Bayes

In [86]:
#text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', MultinomialNB()),
#                    ])

text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('clf', MultinomialNB()),
                    ])

text_clf = text_clf.fit(X_train, y_train)

## SVM

In [87]:
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(
                             loss='hinge',
                             penalty='l2',
                             alpha=1e-3,
                             max_iter=5,
                             random_state=42)),
                        ])

text_clf_svm = text_clf_svm.fit(X_train, y_train)

# Testing

## Naive Bayes

In [88]:
preds = text_clf.predict(X_test)

In [89]:
np.mean(preds == y_test)

0.69

In [90]:
review = text_process("the countryside was beautiful but the surroundings were culture list")
review_transformed = bow_transformer.transform([review])
print(review_transformed)
print(nb.predict(review_transformed)[0])

  (0, 442)	1
  (0, 606)	1
  (0, 617)	1
  (0, 2350)	1
  (0, 3840)	2
  (0, 4347)	1
  (0, 4475)	1
4


## SVM

In [91]:
preds_svm = text_clf_svm.predict(X_test)

In [92]:
np.mean(preds_svm == y_test)

0.7