# LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import re
from nltk import FreqDist

# FUNCTIONS

In [2]:
def clean_up(s):
    """Normalise a raw text string before tokenisation.

    Steps, in order:
      1. every run starting with ``http`` up to the next whitespace is
         replaced by a single space (URL removal);
      2. apostrophes are replaced by spaces and the text is lower-cased;
      3. every character that is not an ASCII letter or a space character
         is replaced by a space.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        Text containing only lower-case ASCII letters and spaces. Runs of
        spaces are not collapsed.
    """
    without_urls = re.sub(r'http\S+', ' ', s)
    lowered = without_urls.replace("'", " ").lower()
    return re.sub(r"[^a-zA-Z ]", " ", lowered)

def tokenize(s):
    """Split a string into a list of tokens with NLTK's ``word_tokenize``.

    Parameters
    ----------
    s : str
        Text to tokenise.

    Returns
    -------
    list
        The tokens produced by ``word_tokenize`` for ``s``.
    """
    return word_tokenize(s)

def stem_and_lemmatize(l):
    """Porter-stem every token, then WordNet-lemmatise the resulting stem.

    Parameters
    ----------
    l : iterable of str
        Tokens to normalise.

    Returns
    -------
    list
        One normalised token per input token, in the same order.
    """
    stemmer = PorterStemmer()
    wordnet = WordNetLemmatizer()
    normalised = []
    for word in l:
        stem = stemmer.stem(word)
        # The lemmatiser is applied to the stem, not to the original word.
        normalised.append(wordnet.lemmatize(stem))
    return normalised

def remove_stopwords(l, stop_words=None):
    """Drop stopwords from a list of tokens.

    Parameters
    ----------
    l : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to discard. When omitted, NLTK's English stopword list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list
        The tokens of ``l`` that are not stopwords, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests; the corpus reader returns a
    # list, which would be scanned linearly for every single token.
    blocked = set(stop_words)
    return [x for x in l if x not in blocked]

# RUN

In [None]:
# Variables
file = '../sentiment140.csv'
col = 'text'         # column with text to clean
target = 'target'    # column with target
pos = 4              # value used for positive
k = 5000             # sample size
f = 500              # number of features
seed = 42            # random seed, so the sample (and the positional split below) is reproducible


# Data
ds = pd.read_csv(file)   #, encoding='latin-1', header = None)
# Seeded sample; the size is capped at the number of rows because
# DataFrame.sample raises when asked for more rows than exist (replace=False).
ds_sub = ds.sample(min(k, len(ds)), random_state=seed)
# Stopwords are removed BEFORE stemming: the Porter stemmer rewrites many of
# them (e.g. "this" -> "thi", "was" -> "wa"), so filtering afterwards lets
# them slip through into the vocabulary.
# Only the text column is needed, so apply on that Series instead of row-wise.
ds_sub['text_processed'] = ds_sub[col].apply(
    lambda text: stem_and_lemmatize(remove_stopwords(tokenize(clean_up(text))))
)


# Bag of Words
# Flatten every processed document into one token stream, count the tokens and
# keep the f most frequent ones as the vocabulary.
all_words = [x for lst in ds_sub['text_processed'] for x in lst]
words_freq = nltk.FreqDist(all_words)
bow = [w[0] for w in words_freq.most_common(f)]


# Features
def find_features(document, vocabulary=None):
    """Build a presence/absence feature dict for one document.

    Parameters
    ----------
    document : iterable of str
        Tokens of the document.
    vocabulary : iterable of str, optional
        Words to use as feature names. When omitted, the notebook-level
        ``bow`` list is used, which keeps the original one-argument call
        working unchanged.

    Returns
    -------
    dict
        Maps each vocabulary word to ``True`` if it occurs in ``document``
        and ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = bow
    # Set lookup keeps the per-word membership test cheap.
    present = set(document)
    return {w: (w in present) for w in vocabulary}

# Pair every processed document with a boolean label: True when the target
# column equals the positive value, False otherwise.
documents = list(zip(ds_sub['text_processed'], (ds_sub[target] == pos).tolist()))
featuresets = [(find_features(rev), category) for (rev, category) in documents]


# Model
# First half of the feature sets trains the classifier, second half tests it.
n = len(featuresets) // 2
training_set = featuresets[:n]
testing_set = featuresets[n:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
# show_most_informative_features prints its table itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
classifier.show_most_informative_features(20)

# Test
# nltk.classify.accuracy returns a float; it must be converted before being
# concatenated to a string (the bare "+" raised TypeError).
accuracy = nltk.classify.accuracy(classifier, testing_set)
print('Accuracy is: ' + str(accuracy))

# TEST

In [3]:
# Read the exported CSV and show it as the cell output.
# The path is relative to the working directory; pd.read_csv raises
# FileNotFoundError when the file is not there.
tokyo_file = 'export_content_geocode_35_6894875_139_69170639999993_52km_2016_04_21_21_51_28.csv'
tokyo = pd.read_csv(tokyo_file)
tokyo

FileNotFoundError: [Errno 2] File b'export_content_geocode_35_6894875_139_69170639999993_52km_2016_04_21_21_51_28.csv' does not exist: b'export_content_geocode_35_6894875_139_69170639999993_52km_2016_04_21_21_51_28.csv'