In [17]:
import sklearn as sk
import numpy as np
import matplotlib as plt
import pandas as pd
from pprint import pprint
import random

###Tokenizer

In [2]:
import csv
import re

## emoticons
emoticons_str = r"""
  (?:
    [:=;] # Eyes
    [oO\-]? # Nose (optional)
    [D\)\]\(\]/\\OpP] # Mouth
  )"""
 
## words
regex_str = [
  emoticons_str,
  r'<[^>]+>', # HTML tags
  r'(?:@[\w_]+)', # @-mentions
  r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
  r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

  r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
  r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
  r'(?:[\w_]+)', # other words
  r'(?:\S)' # anything else
]

## compile regex
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(string):
	return tokens_re.findall(string)

def removable(token):
	isEmoticon = True if emoticon_re.search(token) else False
	isRemovable = token in [',', '.', ':', ';']
	return (isEmoticon or isRemovable)

# pre_processor
def pre_process(string, lowercase=False):
	tokens = tokenize(string)
	tokens = [ token for token in tokens if not removable(token)]
	return tokens

###Load curated tweets

In [3]:
def load_curated_tweets(csv_file, encoding='latin-1'):
    df_curated = pd.read_csv(csv_file, encoding=encoding)
    df_curated = df_curated[['id', 'tweet', 'class']]
    return df_curated

###Load unlabeled tweets

In [4]:
def load_unlabeled_tweets(csv_file, encoding='latin-1'):
    df_curated = pd.read_csv(csv_file, encoding=encoding)
    df_curated = df_curated[['id', 'tweet']]
    return df_curated

###Write tweets

In [5]:
def write_tweets(csv_file, df):
    pass

### Build classifier

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def build_classifier(df_curated, df_all):
    vec = CountVectorizer(tokenizer=pre_process)
    vec.fit(df_all.tweet)
    bagofwords = vec.transform(df_curated.tweet)
    bagofwords = bagofwords.toarray()
    clf = MultinomialNB().fit(bagofwords, df_curated['class'])
    return vec, clf

###Update classifier

In [7]:
def update_classifier(vec, clf, new_curated):
    bagofwords = vec.fit_transform(df_new_curated.tweet)
    bagofwords = bagofwords.toarray()
    clf = clf.partial_fit(bagofwords, df_new_curated['class'])

###Test partial fit

In [8]:
df_curated.reindex(np.random.permutation(df_curated.index))
train_size = int(len(df_curated) * .75)
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(tokenizer=pre_process)
bagofwords = vec.fit(df_curated.tweet)
bagofwords = bagofwords.toarray()
train = bagofwords[:train_size,:]
test = bagofwords[train_size:,:]
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train, df_curated[:train_size]['class'])
clf = clf.partial_fit(test, df_curated[train_size:]['class'])
predicted = clf.predict(test)
pd.crosstab(df_curated[train_size:]['class'], predicted,
                      rownames=['actual'], colnames=['predicted'])

NameError: name 'df_curated' is not defined

###Pick uncertain samples

In [70]:
def pick_uncertain_samples(sample_size, batch_size, vec, clf, df_unlabeled):
    # take random sample of unlabeled
    rows = np.random.choice(df_unlabeled.index.values, sample_size)
    df_sample = df_unlabeled.ix[rows]
    sample = vec.transform(df_sample.tweet)
    
    # predict them
    predicted = clf.predict_log_proba(sample)
    predicted = [sum(n) for n in predicted]
    
    df_sample['predicted'] = predicted
    i = np.argpartition(np.array([-n for n in predicted]), batch_size)[:batch_size]
    df_sample = df_sample.iloc[i]
    
    # pick batch_size with least uncertainty
    return df_sample

### Test picker

In [71]:
curated_csv = '../twitfem/curated/davidt1.csv'
unlabeled_csv = '../twitfem/twitfem.csv'
df_curated = load_curated_tweets(curated_csv)
df_unlabeled = load_unlabeled_tweets(unlabeled_csv)
vec, clf = build_classifier(df_curated, df_unlabeled)

In [72]:
sample= pick_uncertain_samples(1000, 1, vec, clf, df_unlabeled)
sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,id,tweet,predicted
837928,585887523710205952,"#FIFA ,When will you judge them http://t.co/8m...",-3.299647


### Main

In [34]:
import sys

def main(argv):
    try:
        curated_csv = argv[1]
        batch_size = argv[2]
    except IndexError:
        print('Missing command line argument.')
        
    build_classifier(curated_csv)
    


if __name__ == '__main__':
    main(sys.argv)

SyntaxError: unexpected EOF while parsing (<ipython-input-34-b8c4634ff38c>, line 1)