In [1]:
# Imports
import pickle
import numpy as np
import pandas as pd
import nltk
import string
import re
from nltk.stem.porter import *
from sklearn.externals import joblib
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *


## Load helper functions and pickled files

In [7]:
# Helper functions

def preprocess(text_string):
    """
    Accepts a text string and applies, in this order:
    1) collapses every run of whitespace into a single space
    2) replaces urls with URLHERE
    3) replaces mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    without caring about specific people mentioned.

    Args:
        text_string: the text to normalize.

    Returns:
        The normalized text.
    """
    # Raw string literals: the regex escapes below (\s, \w, \(, \-) are not
    # valid *string* escapes, so non-raw literals trigger invalid-escape
    # warnings on newer interpreters. The compiled patterns are unchanged.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    # Whitespace is collapsed first, so the url pattern (which cannot match a
    # space) always stops at the next word boundary.
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens.

    Args:
        tweet: the text to tokenize.

    Returns:
        A list with one stemmed token per run of ASCII letters in ``tweet``.

    Relies on a module-level ``stemmer`` object exposing ``stem(token)``.
    NOTE(review): ``stemmer`` is not defined in the cells visible here;
    presumably a PorterStemmer instance -- confirm it is created before
    this function is first called.
    """
    # Split on runs of ONE OR MORE non-letters. The previous pattern used `*`,
    # which can match the empty string: older interpreters ignored such empty
    # matches, but Python 3.7+ splits on them and would break every word into
    # single characters.
    words = re.split("[^a-zA-Z]+", tweet.lower())
    # Leading/trailing separators leave empty strings behind; drop them.
    return [stemmer.stem(w) for w in words if w]

In [8]:
# Load final model files
# NOTE(review): joblib.load unpickles its input, and unpickling can execute
# arbitrary code -- only point these paths at trusted artifacts.
# NOTE(review): presumably the pickled vectorizers refer to the `preprocess`
# and `tokenize` helpers by name, which is why those are defined first --
# confirm before reordering cells.
# print(...) with a single argument emits the same text on Python 2 and also
# parses on Python 3.
print("Loading trained classifier... ")
model = joblib.load('classifier/final_model.pkl')

print("Loading other information...")
tf_vectorizer = joblib.load('classifier/final_tfidf.pkl')
idf_vector = joblib.load('classifier/final_idf.pkl')
pos_vectorizer = joblib.load('classifier/final_pos.pkl')

Loading trained classifier... 
Loading other information...




## Preprocess the data

In [10]:
# Read the pickled DataFrame and pull out its `tweet` column.
df = pd.read_pickle("data/labeled_data.p")
tweets = df["tweet"]

In [11]:
# Coerce every tweet to a `unicode` object (Python 2), keeping one output
# entry per input tweet.
fixed_tweets = []
for t_orig in tweets:
    s = t_orig
    # Try latin1 first, then utf-8. Only encoding/decoding failures are
    # tolerated here; a bare `except:` would also swallow KeyboardInterrupt
    # and hide unrelated bugs.
    try:
        s = s.encode("latin1")
    except UnicodeError:
        try:
            s = s.encode("utf-8")
        except UnicodeError:
            # Neither codec worked: fall through with the value unchanged.
            pass
    if isinstance(s, unicode):
        fixed_tweets.append(s)
    else:
        # Decode the byte string, silently dropping bytes that cannot be
        # decoded with the interpreter's default codec.
        fixed_tweets.append(unicode(s, errors="ignore"))
assert len(tweets) == len(fixed_tweets), "shouldn't remove any tweets"
# Downstream cells read `tweets`, so rebind it to the cleaned list.
tweets = fixed_tweets
# Two spaces are intentional: this matches the output of the former
# `print len(tweets), " tweets to classify"` statement.
print("%d  tweets to classify" % len(tweets))

24783  tweets to classify
