In [1]:
import pandas as pd
import re
import spacy   
nlp = spacy.load('en_core_web_sm') 

In [2]:
df = pd.read_csv("tweets_grouped.csv")

In [3]:
import re,string

def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

def strip_all_entities(text):
    entity_prefixes = ['@','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [4]:
def clean_str(s):
    """Clean sentence"""
    if type(s) is float:
        return s
    if len(s)<1:
        return s
    s = strip_all_entities(strip_links(s))
    s = re.sub(r"[^A-Zæåøa-z0-9#,\??\'\`]", " ", s)
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\(", " \( ", s)
    s = re.sub(r"\)", " \) ", s)
    #s = re.sub(r"\?", " \? ", s)
    # s = nb_tokenizer(s)
    doc = nlp(s,disable=['parser', 'ner'])
    tokens = [tok.text.lower().strip() for tok in doc if tok.pos_ in  ['NOUN','PROPN','ADJ','ADV']]
    tokens = ' '.join(tokens)
    return tokens.strip().lower()


In [5]:
df['text_clean'] = df['text'].map(lambda x: clean_str(x))

Extract Local terms 

In [6]:
def extract_hashtags(tweet):
    return set(re.findall(r"#(\w+)", tweet))


In [7]:
def filter_postags(tweet):
    tweet = clean_str(tweet)
    doc = nlp(tweet) 
    for tok in doc:
        print(tok.pos_, tok.text)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x=df['text_clean']

In [10]:
y=df['state']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

Model benchmarking

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np


In [13]:
target_names = y.unique()

In [14]:
target_names

array(['California', 'Georgia', 'Maryland', 'Massachusetts', 'Minnesota',
       'New York', 'Texas', 'Louisiana', 'North Carolina', 'New Jersey',
       'Arizona', 'Florida', 'Oregon', 'Tennessee', 'Illinois', 'Ohio',
       'Washington', 'Indiana', 'Nevada', 'Michigan', 'Utah',
       'South Carolina', 'Missouri', 'Colorado', 'Pennsylvania',
       'Connecticut', 'Wisconsin', 'Virginia', 'Oklahoma', 'Alabama',
       'Iowa'], dtype=object)

In [15]:
feature_names = []

In [16]:
def vectorize(vect,X_train,X_test,chisquare=False,n_features=10000):
    if vect == "hashing":
        vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(X_train)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    if vect == "hashing" :
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if chisquare:
        ch2 = SelectKBest(chi2, k=n_features)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        print("selected features using chi2")
    if feature_names:
        feature_names = np.asarray(feature_names)
        return X_train,X_test,feature_names
    return X_train,X_test

In [17]:
def benchmark(clf,print_top10 = True,print_report = False):
    print("Training: ")
    print(clf)
    clf.fit(X_train, y_train)
    print("trained model")
    pred = clf.predict(X_test)
    print("predicted using model")

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print("%s: %s" % (label, " ".join(feature_names[top10])))
        print()

    if print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score

In [18]:
#noun, proper noun and adjective
X_train,X_test,feature_names = vectorize("tfidf",X_train,X_test,chisquare=True)
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=100), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=100),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (LogisticRegression(n_jobs=1,C=1e5,max_iter=100),"lr"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

selected features using chi2
Ridge Classifier
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None, solver='sag',
                tol=0.01)
trained model
predicted using model
accuracy:   0.447
dimensionality: 10000
density: 0.999816
top 10 keywords per class:
California: juse asterisk eagle tuscaloosa birmingham bham bama huntsville auburn alabama
Georgia: sedona asu tucson tempe suns arizona phoenix phx az scottsdale
Maryland: xo angeles ca diego sd california hollywood san sf la
Massachusetts: missrachel nuggets broncos ski rockies mlsdonkey snow boulder colorado denver
Minnesota: sinergyst inactives fairfield norwalk espn2 indigodreamer reaction ct hartford stamford
New York: dade jacksonville bios dolphins bucs fl miami tampa florida orlando
Texas: buckhead hawks alpharetta shawty savannah georgia falcons ga atl atlanta
Louisiana: daley wgn bulls il cta illinois twisten bears chi



trained model
predicted using model
accuracy:   0.350
dimensionality: 10000
density: 1.000000
top 10 keywords per class:
California: midsize eagle chapel klutz birmingham generically dewy workbay damons planetxbox360
Georgia: horry fvf wacked ev1 barbosa ansi kiawah yazoo photograp wey
Maryland: firsttake bagdad moreira roam 4v cassius timepieces ypsilanti broomfield shanell
Massachusetts: favored mankato renegades inthe213 orangeburg nfc charle rhymesayers grigsby clarksville
Minnesota: pardcast sdc stamford ct reaction smoochy hartford bram 4x postively
New York: griese eatn mazzella prudential kgw khartoum burnie papago conneticut heav
Texas: achive westbank okla kasim grigsby numer hennepin latrine fms dundalk
Louisiana: wbbm ecorazzi zins grizzley berrian wgn sano babycenter commanders maddy
North Carolina: umake lampert telephony jvk consulates bolan pintura lifework macgyver abbreviated
New Jersey: rattray kinoki mechanically mclean des ladygoodman avw kimble moines iowa
Arizona



trained model
predicted using model
accuracy:   0.369


In [22]:
#noun, proper noun and adjective
X_train,X_test = vectorize("hashing",X_train,X_test,chisquare=True)
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=100), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=100),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (LogisticRegression(n_jobs=1,C=1e5,max_iter=100),"lr"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

selected features using chi2
Ridge Classifier
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None, solver='sag',
                tol=0.01)
trained model
predicted using model
accuracy:   0.406
dimensionality: 10000
density: 0.999997
top 10 keywords per class:
California: imitates mobiletwitter unsexy tipps wilhite cmnhospitals wyff interestin mbta thaaaaaaaa
Georgia: omniweb maxam thiopental tinora geeeny alexander81 relays homless 6n ner
Maryland: facebooktips4u rashford earnhardt medicating geekz huffingtonpost mwahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahaha 50f fuglies badgers
Massachusetts: ridiciulous approachin irresistibly ks360 hartford prov1s faaashhooo iamsuperexcitedfortonightthepartyisgonnabesomuchfun guitarworld gonzalez
Minnesota: zander liesel mamola collinearl doctorlady paymah dagg sumfin swett cbweb



trained model
predicted using model
accuracy:   0.352
dimensionality: 10000
density: 1.000000
top 10 keywords per class:
California: berz randallstown mbta dannnggg priorties mobiletwitter bombard cmnhospitals unsexy thaaaaaaaa
Georgia: welcom t2a tinora kevinnottingham geeeny alexander81 ner relays homless 6n
Maryland: chellie rltw vezner trott thechristianauthorsshow malawa herbstriet rashford dilworth badgers
Massachusetts: luce traverus goodmooorrnnniiinnggg pizzabarfor jacksonorleans1 iamsuperexcitedfortonightthepartyisgonnabesomuchfun wisconisn faaashhooo guitarworld gonzalez
Minnesota: lilwil saabsquad thetandd damone swett paymah ruinsinred beady dagg cbwebdesigner
New York: gambrills plummer willaim tofollow minneapolis ariston cadd oooooohhh corvair dreamsz
Texas: uummm fcukin fuma meganne diigo olmenius burgh mattie fripp bartlet
Louisiana: frou hierarch groepschoreografie kavemayne shatnerquake 4show vestival grisham reactivity loseweighttweet
North Carolina: mylilrobotponi

In [68]:
#noun and proper noun
X_train,X_test,feature_names = vectorize("tfidf",X_train,X_test,chisquare=True)
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=100), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=100),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (LogisticRegression(n_jobs=1,C=1e5,max_iter=100),"lr"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

selected features using chi2
Ridge Classifier
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='sag',
        tol=0.01)
trained model
predicted using model
accuracy:   0.447
dimensionality: 10000
density: 0.999900
top 10 keywords per class:
California: juse asterisk eagle tuscaloosa birmingham bham auburn bama huntsville alabama
Georgia: modexperts asu suns tucson tempe phoenix arizona phx az scottsdale
Maryland: angeles los diego ca sd california hollywood san sf la
Massachusetts: missrachel broncos nuggets ski rockies snow mlsdonkey boulder colorado denver
Minnesota: nagsandbrags inactives fortifiedrecords fairfield norwalk indigodreamer reaction ct hartford stamford
New York: dolphins lauderdale ucf bios bucs fl miami tampa florida orlando
Texas: hawks alpharetta shawty preciate savannah georgia falcons ga atl atlanta
Louisiana: bayless wgn bulls illinois cta il twisten bears c