In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

In [2]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    0. Remove all links and referneces (@Name ...), digits
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words
    4. Convert words ot its infinitve form
    '''
    text = re.sub(r"@\S+", "", text)
    text = re.sub(r"http:\S+", "", text)
    text = re.sub(r"\d\S+", "", text)
    nopunc = [char for char in text if char not in string.punctuation]
    result = ''.join(nopunc)
    
    result = [word for word in result.split() if word.lower() not in stopwords.words('english')]

    stemmer = PorterStemmer()
    result = [ stemmer.stem(word) for word in result ]
    result = ' '.join(result)
    return result

In [4]:
tweets = pd.read_csv('./training-data/tweets.csv')
tweets.columns = ['label', 'text']

tweets['text_length'] = tweets['text'].apply(len)
the_longest_tweets = tweets[tweets['text_length'] == tweets['text_length'].max()]

X_orig = tweets['text']
y = tweets['label']

In [5]:
X_processed = [text_process(text) for text in X_orig]

In [7]:
bow_transformer = CountVectorizer(ngram_range=(1,1)).fit(X_processed)
X = bow_transformer.transform(X_processed)
print("Vocabulary size is: ",len(bow_transformer.vocabulary_))

Vocabulary size is:  1651


In [8]:
# Split data to test and training datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [9]:
# Train the classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
preds = nb.predict(X_test)

In [11]:
review = text_process("the countryside was beautiful but the surroundings were culture list")
review_transformed = bow_transformer.transform([review])
print(review_transformed)
print(nb.predict(review_transformed)[0])

  (0, 138)	1
  (0, 858)	1
2
