In [125]:
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import defaultdict
from collections import Counter
import math
import time
import csv
import re

In [173]:
# ['favorited', 'retweetCount', 'created', 'text', 'isRetweet', 'truncated', 'retweeted', 'replyToUID', 
# 'longitude', 'replyToSN', 'label', 'replyToSID', 'latitude', 'statusSource', 'favoriteCount', 'id', 'screenName']
with open('all/train.csv', 'rb') as csvfile:
    tweets = csv.DictReader(csvfile)
    tweets = list(tweets)
    l = len(tweets)
    c_index = int(0.9*l)
    train_data = [t['text'] for t in tweets[0:c_index]]
    train_data2 = [(t['text'], t['statusSource']) for t in tweets[0:c_index]]
    validate_content = [t['text'] for t in tweets[c_index:]]
    validate_content2 = [(t['text'], t['statusSource']) for t in tweets[c_index:]]
# with open('all/test.csv', 'rb') as csvfile:
#     tweets = csv.DictReader(csvfile)
#     tweets = list(tweets)
#     print(tweets[0])
#     test_content = [t['text'] for t in tweets]

In [127]:
tfidf = TfidfVectorizer(max_df=0.95,min_df=0.02)

In [128]:
tfidf.fit(train_data)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [129]:
n_cols = len(tfidf.get_feature_names())

In [130]:
# tfidf.get_feature_names()

# Attempt at Cosine Similarity

In [131]:
word_splitter = re.compile(r"""
    (\w+)
    """, re.VERBOSE)

# sent_words = [word_splitter.findall(sent)
#               for sent in content]

sent_words = [sent.split() for sent in train_data]

#list of words for each tweet
sent_words_lower = [[w.lower() for w in sent]
                    for sent in sent_words]

terms=sorted(set([w for sent in sent_words_lower for w in sent]))

In [132]:
# TF (term frequency) vectorization
# We represent vectors in a "sparse" dictionary format.
# All keys not present in the dictionary are assumed to be zeros.

def doc_to_vec(term_list):
    d = {}
    for v in terms:
        d[v] = term_list.count(v)
    return d

def query_to_vec(term_list):
    d = {}
    for v in terms:
        d[v] = term_list.count(v)
    return d

def dot(d, q):
    sum=0
    for v in d:  # iterates through keys
        sum += d[v] * q[v]
    return sum

def norm(d):
    sum_sq = 0
    for v in d:
        sum_sq += d[v] * d[v]
    return math.sqrt(sum_sq)

def cos_measure(query_words, sentence):
    A = query_to_vec(query_words)
    B = doc_to_vec(sentence)
    if norm(A)==0 or norm(B)==0: return 0
    return float(dot(A, B)) / (norm(A) * norm(B))

In [197]:
from operator import itemgetter

def run_search(query, similarity_measure):
    query_words = [w.lower() for w in query.split()]
    sent_scores = [(sent_words_lower[i], similarity_measure(query_words, sent_words_lower[i]), i)
                   for i in range(len(sent_words_lower))]
    #take top ten scoring sentences
    sent_scores = sorted(sent_scores, key=itemgetter(1), reverse=True)
    top_ten = []
    sent_scores = [(sent, score, index) for sent, score, index in sent_scores[0:10] if score > 0]
#     joined_sents = [(" ".join(sent), score, i) for sent, score, i in sent_scores]
    return sent_scores

In [198]:
x = run_search(validate_content[1], cos_measure)


In [154]:
results = []
for i in range(5):#len(validate_content)):
    top_ten = run_search(validate_content[i], cos_measure)
    a = 0
    ip = 0
    for sent, score, index in top_ten:
        if 'Android' in train_data2[index][1]:
            a += 1
        elif 'iPhone' in train_data2[index][1]:
            ip += 1
    if a > ip:
        results.append((i,'Android'))
    else: 
        results.append((i, 'iPhone'))

In [158]:
#function to calculate accuracy
results

[(0, 'Android'),
 (1, 'Android'),
 (2, 'Android'),
 (3, 'Android'),
 (4, 'Android')]

In [157]:
correct = 0
for r in results:
    if 'Android' in validate_content2[r[0]][1] and 'Android' in r[1]:
        correct += 1
    elif 'iPhone' in validate_content2[r[0]][1] and 'iPhone' in r[1]:
        correct += 1
print(correct)

4


In [167]:
IDF = {}
DF = {}

for t in terms:
    DF[t] = len([1 for sent in sent_words_lower if t in sent])
    IDF[t] = 1 / float(DF[t] + 1)

In [169]:
def doc_to_vec_df(term_list):
    d = {}
    for v in terms:
        d[v] = term_list.count(v) * IDF[v]
    return d

def query_to_vec_df(term_list):
    d = {}
    for v in terms:
        d[v] = term_list.count(v) * IDF[v]
    return d

In [177]:
inverted_index=defaultdict(list)
for doc_idx, doc in enumerate(train_data): # by iterating this way we make sure that doc_ids will always appear sorted in the postings
    for term in doc:
        inverted_index[term].append(doc_idx)
#         if doc_idx not in inverted_index[term]: # this way we have unique doc_ids      
#             inverted_index[term].append(doc_idx) 
inverted_index_tf=defaultdict(list)
for term in inverted_index:
    postings=inverted_index[term]
    if len(postings)>0:
        lastdoc=-1
        i=0
        while i<len(postings):
            tf=0
            lastdoc=postings[i]
            while i<len(postings) and lastdoc==postings[i]:
                tf+=1
                lastdoc=postings[i]
                i+=1
            inverted_index_tf[term].append((lastdoc,tf))

In [176]:
def merge_postings(term1,term2):
    postings1=inverted_index[term1]
    postings2=inverted_index[term2]
    merged_posting=[]
    i,j=0,0
    while i<len(postings1) and j<len(postings2):
        if postings1[i]==postings2[j]:
            merged_posting.append(postings1[i])
            i+=1
            j+=1
        elif postings1[i]<postings2[j]:
            i+=1
        else:
            j+=1
    return merged_posting

In [184]:
def docweight(queryterm, tf):
    return tf*IDF[queryterm]

doc_norms={}
for doc_idx, doc in enumerate(train_data):
    doc_norms[doc_idx]=norm(doc_to_vec(train_data))

In [214]:
## let's use the TF-IDF weighting 
def run_search_on_index(query):
    query_words = [w.lower() for w in query.split()]
    query_weights_dict={}  ## precomputed nonzero query  weights
    large_vector=query_to_vec(query_words)
    for t in large_vector:
        if large_vector[t]>0:
            query_weights_dict[t]=large_vector[t]
    ## these are the score accumulators
    ## they will accumulate accumulate TF-IDF weight products (w_iq * w_ij) 
    doc_scores=defaultdict(int)  
    
    ## since keys of our index are query words, we will iterate through those
    for queryterm in query_words:
        postings=inverted_index_tf[queryterm]
        for doc_idx,tf in postings:   ## only touching docs that are involved in the query
            if queryterm in query_weights_dict.keys():
                weight_update=query_weights_dict[queryterm]*docweight(queryterm,tf)
                doc_scores[doc_idx]+=weight_update  ## accumulate TF-IDF updates here 
            ##(at the end of the outer loop, for each document that has any of these terms
            ##  we will have accumulated all TF-IDF weights that are needed to compute the 
            ## dot-product part of the cosine similarity score

    for d in doc_scores:
        if doc_norms[d] == 0 or norm(query_weights_dict) == 0:
            doc_scores[d] = 0
        else:
            doc_scores[d]=doc_scores[d]/(float(doc_norms[d])*float(norm(query_weights_dict)))  
        ## normalization part of the cosine similarity score
        ## same ORDER if you remove the query norm (every document is divided by it)
    
    doc_idx_scores = sorted(doc_scores.items(), key=itemgetter(1), reverse=True)  ##further optimization possible
    doc_scores = [(train_data[doc_idx], score, doc_idx)
                   for doc_idx, score in doc_idx_scores
                   if score > 0]

    return doc_scores

In [215]:
results = []
for i in range(len(validate_content)):
    top_ten = run_search_on_index(validate_content[i])
    a = 0
    ip = 0
    for sent, score, index in top_ten:
        if 'Android' in train_data2[index][1]:
            a += 1
        elif 'iPhone' in train_data2[index][1]:
            ip += 1
    if a > ip:
        results.append((i,'Android'))
    else: 
        results.append((i, 'iPhone'))

In [216]:
correct = 0
for r in results:
    if 'Android' in validate_content2[r[0]][1] and 'Android' in r[1]:
        correct += 1
    elif 'iPhone' in validate_content2[r[0]][1] and 'iPhone' in r[1]:
        correct += 1
print(correct)

51


In [217]:
len(results)

109