### Train a Sentiment Classifier Using Annotated CrowdFlower Data
For definitions of the CrowdFlower terms used below, see the [CrowdFlower Glossary of Terms](https://success.crowdflower.com/hc/en-us/articles/202703305-Glossary-of-Terms).

In [1]:
import re
import warnings
import numpy as np
import pandas as pd

from nltk.tokenize.casual import TweetTokenizer
from nltk.stem import PorterStemmer

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# Show up to 500 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 500)

# Never truncate cell text (tweets are long). Newer pandas releases expect
# None for "no limit" and reject negative values, while older releases only
# accept an int and use -1 for the same meaning, so try None first.
try:
    pd.set_option("display.max_colwidth", None)
except ValueError:
    pd.set_option("display.max_colwidth", -1)

# Silence DeprecationWarning noise raised by the libraries imported above.
warnings.filterwarnings("ignore", category=DeprecationWarning)

#### Read in Clean CF Data

In [None]:
# Load the cleaned CrowdFlower sentiment data.
# `read_pickle` lives in the pandas namespace; only `pd` is imported above.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
cf = pd.read_pickle("data/cf_clean_sentiment.pkl")
cf.head(5)

#### Split Data into Training & Test

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(
    cf,
    random_state=4444,
    test_size=0.2,
)

#### Feature Engineering

In [None]:
def preprocesser(text):
    """Strip tweet-specific noise from ``text`` and return the cleaned string.

    The following are removed, in order: http/https URLs, retweet markers
    (a standalone ``RT``, optionally followed by a colon), @usernames
    (with an optional trailing colon), and #hashtags. Every remaining run
    of characters other than ASCII letters and commas is then collapsed to
    a single space, and leading/trailing spaces are trimmed.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned text, containing only ASCII letters, commas and single
        spaces.
    """
    # remove urls (the previous pattern contained the invalid escape "\h",
    # which raises re.error on current Python versions)
    text = re.sub(r"https?://\S+", "", text)
    # remove RT symbols; the word boundaries keep words that merely end in
    # "RT" (e.g. "SMART") intact, and this runs before colons are touched
    # so that the "RT:" form is actually matched
    text = re.sub(r"\bRT\b:?\s*", "", text)
    # remove usernames, including the colon in "@user: ..."; other colons
    # are left for the non-alpha step so neighbouring words are not glued
    text = re.sub(r"@\w+:?", " ", text)
    # remove hashtags
    text = re.sub(r"#\w+", " ", text)
    # remove other non-alpha characters (commas are deliberately kept)
    text = re.sub(r"[^a-zA-Z,]+", " ", text)
    # remove leading and trailing whitespace
    return text.strip(" ")

# Built once and reused for every document: the configuration never changes
# between calls, so there is no reason to construct it per tweet.
_TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

def tokenizer(text):
    """Clean ``text`` with ``preprocesser`` and split it into tokens.

    The cleaned text is tokenized with NLTK's ``TweetTokenizer`` configured
    with ``preserve_case=False``, ``reduce_len=True`` and
    ``strip_handles=True``. Tokens shorter than four characters are dropped.

    Args:
        text: The raw tweet text.

    Returns:
        A list of the tokens that are at least four characters long.
    """
    # The cleaning function in this file is named `preprocesser`; the
    # previous call to `preprocess` raised NameError.
    cleaned = preprocesser(text)
    return [token for token in _TWEET_TOKENIZER.tokenize(cleaned) if len(token) >= 4]

#### Train Models and Tune Hyperparameters

In [None]:
def train(X, y):
    """Placeholder for model training and hyperparameter tuning.

    Not implemented yet: both arguments are ignored and nothing is fitted.

    Args:
        X: Unused.
        y: Unused.

    Returns:
        None.
    """
    # NOTE(review): defining this function rebinds the module-level name
    # `train`, which the train/test split cell also assigns - confirm that
    # the split result is not needed under that name afterwards.
    return None