In [2]:
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
from gensim import corpora, models, matutils
import numpy as np
from sklearn import metrics
from operator import itemgetter
from collections import Counter
import json
import gzip
from pprint import pprint
import os
import re

In [3]:
def read_tweets_from_file(category, results_path):
    """Read the tweet file that belongs to ``category`` and return its lines.

    Parameters
    ----------
    category : str
        Key selecting the input file; must be one of the keys of ``switcher``
        below (currently ``"ua"`` or ``"aa"``).
    results_path : str
        Prefix that is concatenated directly with the file name, so a
        directory must include its trailing path separator.

    Returns
    -------
    list
        The file content split with ``splitlines()``. The file is opened in
        binary mode, so line terminators are stripped but nothing is decoded.

    Raises
    ------
    ValueError
        If ``category`` has no entry in ``switcher``.
    """
    switcher = {
        "ua": "ua.txt",
        "aa": "aa.txt",
        }
    file_name = switcher.get(category)
    if file_name is None:
        # Without this guard the failure surfaces as an opaque TypeError from
        # ``results_path + None`` below.
        raise ValueError(
            "Unknown category %r; expected one of %s" % (category, sorted(switcher)))

    # The context manager closes the file; no explicit close() is needed.
    with open(results_path + file_name, 'rb') as f:
        return f.read().splitlines()

In [16]:
def _dominant_topic_words(topic_repr):
    """Return the space-joined words on the dominant-sign side of an LSI topic.

    ``topic_repr`` is the string produced by ``LsiModel.print_topic``. Weights
    and words are extracted with regular expressions and paired positionally.
    The positive side wins when it has more terms or a larger absolute weight
    sum; otherwise the negative side is returned.
    """
    flattened = "".join(topic_repr.split('+'))
    weights = re.findall(r"[-+]?\d*\.\d+|\d+", flattened)
    words = re.findall(r"[^\W\d_]+", flattened)
    # list() so the pairs can be iterated regardless of what zip() returns.
    weighted_words = list(zip(weights, words))

    neg_weights = [float(w) for w in weights if float(w) < 0]
    pos_weights = [float(w) for w in weights if float(w) > 0]

    if len(pos_weights) > len(neg_weights) or abs(sum(pos_weights)) > abs(sum(neg_weights)):
        chosen = [word for weight, word in weighted_words if float(weight) > 0]
    else:
        chosen = [word for weight, word in weighted_words if float(weight) < 0]
    return ' '.join(chosen)


def Clustering_Tweets(category, path, exp, cluster_size, dup_removed=0,threshold=0.8,limit=10):
    """Cluster the tweets of ``category`` with LSI and write the clusters to disk.

    Each input line is expected to look like ``"<id> <text>"``. Tweets are
    de-duplicated on exact text, tokenized, filtered, projected into an LSI
    space and assigned to the topic with the largest absolute weight.

    Parameters
    ----------
    category : str
        Passed to ``read_tweets_from_file`` and used in the output file names.
    path : str
        Prefix for both the input file and the output files; it is
        concatenated as-is, so a directory needs its trailing separator.
    exp : int
        ``1`` builds LSI on the raw bag-of-words corpus; any other value
        applies TF-IDF first. ``3`` additionally drops tweets whose filtered
        token list duplicates an earlier tweet's.
    cluster_size : int
        Number of LSI topics (``num_topics``).
    dup_removed : int
        Non-zero removes repeated ``[id, text]`` entries inside a cluster.
    threshold : float
        A tweet is kept in a cluster only if its absolute LSI weight is
        strictly greater than this value.
    limit : int
        Minimum number of tweets a cluster needs in order to be written and
        printed. Smaller clusters still contribute to the reported total.

    Returns
    -------
    None
        Results are written to three files next to the input
        (``*_topics_*.txt``, ``*_clusters_*.txt`` and ``*.json``) and echoed
        to stdout.
    """
    tweets_all = read_tweets_from_file(category, path)

    # Split every line into (tweet id, tweet text).
    tweet_and_id = []
    for line in tweets_all:
        parts = line.split(' ', 1)
        if len(parts) < 2:
            # No text after the id: report the line and skip it.
            print(line)
            continue
        tweet_and_id.append((parts[0], parts[1].replace('\r', '')))

    # Drop tweets whose text was already seen; ``ref`` stays aligned with ``tweets``.
    tweets = []
    ref = []
    seen_text = set()
    for i, (tid, ttext) in enumerate(tweet_and_id):
        if ttext in seen_text:
            continue
        seen_text.add(ttext)
        tweets.append(ttext)
        ref.append((i, tid, ttext))

    # NOTE(review): both word lists are read from hard-coded absolute paths.
    with open('/Users/alinamazi/Data/stopword-list2.txt') as stop_file:
        stoplist = set(entry.replace("\n", "") for entry in stop_file)
    pun = set([',', '-','.', ':', '(', ')', '--', ';', '...', 'just','right','','today','follow','stats','can', 'say', 'says', 'will','may', 'must', 'us', 'via','a','the', 'rt', 'gg', 'gt', 'lt', 'la', 'de', 'te', 'lol', 'follow', 'followers', 'unfollow', 'unfollowers', 'unfollower', 'follower'])

    with open('/Users/alinamazi/Data/common_list.txt','rt') as myfile:
        common_word = set(myfile.read().splitlines())

    # Tokenize and filter; ``tweet_list`` stays aligned with ``tweet_tokens``.
    tknzr = TweetTokenizer()
    dedupe_on_tokens = int(exp) == 3
    seen_tokens = set()
    tweet_tokens = []
    tweet_list = []
    for i, line in enumerate(tweets):
        lowered = [tok.lower() for tok in tknzr.tokenize(line)]
        kept = [tok for tok in lowered
                if tok not in stoplist and tok not in common_word and tok not in pun
                and len(tok) > 2 and tok.isalpha() and 'http' not in tok and '@' not in tok]
        if not kept:
            continue
        if dedupe_on_tokens:
            # Experiment 3: skip tweets that reduce to an already-seen token list.
            token_key = tuple(kept)
            if token_key in seen_tokens:
                continue
            seen_tokens.add(token_key)
        tweet_tokens.append(kept)
        tweet_list.append(str(ref[i][1]) + " " + str(line))

    tweet_count = len(tweet_tokens)
    print("Done with tokenizations. tweets count:  {}".format(tweet_count))

    dictionary = corpora.Dictionary(tweet_tokens)
    print(dictionary)

    corpus = [dictionary.doc2bow(t) for t in tweet_tokens]

    n_cluster = cluster_size

    # LSI computing: on raw counts for experiment 1, on TF-IDF weights otherwise.
    if int(exp) == 1:
        model_input = corpus
    else:
        model_input = models.TfidfModel(corpus)[corpus]
    lsi = models.LsiModel(model_input, id2word=dictionary, num_topics=n_cluster) # initialize an LSI transformation
    corpus_lsi = lsi[model_input]

    # Label each document with the topic of largest absolute weight.
    lsi_topic_labels = []
    for i, doc in enumerate(corpus_lsi):
        a = np.array(doc)
        if len(a) == 0:
            # Empty projection: fall back to cluster 0 with zero weight.
            cluster_id = 0
            prob_value = 0.0
        else:
            strongest = np.abs(a[:, 1]).argmax()
            cluster_id = a[strongest][0]
            prob_value = abs(a[strongest][1])
        lsi_topic_labels.append((i, cluster_id, prob_value))

    ##For silhouette score
    np_labels = np.asarray([label for _, label, _ in lsi_topic_labels])

    # convert corpus to array
    corpusAsMatrix = matutils.corpus2dense(corpus_lsi, num_terms=n_cluster).transpose()

    score = metrics.silhouette_score(corpusAsMatrix, np_labels, metric='cosine')

    print('silhouette score is: {}'.format(score))

    ###MAKING READY FOR sorting, categorizing and writing into file
    ranked = sorted(lsi_topic_labels, key=itemgetter(1,2), reverse=True)
    cluster_topic_list = [e for e in ranked if float(e[2]) > 0.2]

    cluster_list = sorted(cluster_topic_list, key=lambda x: x[1])
    counter = Counter(b for a,b,c in cluster_list)
    # Cluster ids ordered from the most to the least populated cluster.
    cid_sort = [cid for cid, _ in counter.most_common(n_cluster)]

    print('n_cluster is {}'.format(n_cluster))

    stem = path + category + '_exp' + str(exp)
    run_tag = '_v' + str(dup_removed) + '_c' + str(cluster_size) + '_id' + str(threshold) + '_s' + str(limit)
    topics_path = stem + '_topics' + run_tag + '.txt'
    clusters_path = stem + '_clusters' + run_tag + '.txt'
    json_path = stem + run_tag + '.json'

    def jdefault(o):
        return o.__dict__

    second_dic = {}
    count = 0
    cluster_count = 0
    min_cluster_size = int(limit)
    remove_cluster_dups = int(dup_removed) != 0

    # Both text outputs are managed by ``with`` so they are always flushed and closed.
    with open(topics_path, 'wt') as topics_file, open(clusters_path, 'w') as output_file:
        for rank, cid in enumerate(cid_sort):
            sublist = [e for e in cluster_topic_list if e[1] == cid]
            t_index = int(cid)

            ##topic word preparation
            topic = _dominant_topic_words(lsi.print_topic(t_index, topn=10))

            if topic == "":
                # No usable topic words: report where it happened and stop.
                print(rank)
                print("{} {}".format(cid, t_index))
                print(topic)
                break

            # Keep the tweets whose weight exceeds the threshold.
            l = []
            seen_entries = set()
            for ss in sublist:
                if not float(ss[2]) > float(threshold):
                    continue
                id_and_text = tweet_list[ss[0]].split(' ', 1)
                if remove_cluster_dups:
                    entry_key = tuple(id_and_text)
                    if entry_key in seen_entries:
                        continue
                    seen_entries.add(entry_key)
                l.append((float(ss[2]), id_and_text))

            count += len(l)
            # ``limit`` is the minimum cluster size; it was previously hard-coded
            # to 10 even though the parameter is part of the output file names.
            if len(l) < min_cluster_size:
                continue
            cluster_count += 1

            topics_file.write('Topic #%d' %t_index + '(size %d):' %len(l) +' %s'  %topic +'\n')

            members = "".join("%s %s %s\n" % (tup[0], tup[1][0], tup[1][1]) for tup in l)

            output_file.write("\n\n")
            output_file.write('Cluster # %d' %t_index + '-----------------------\n')
            output_file.write('Total items # %d' %len(l) + '-----------------------\n\n')
            output_file.write('Topic: %s' %topic +'\n')
            output_file.write(members)

            second_dic['T'+str(rank)] = {
                'clusters': [[e[1][0]] for e in l],
                'topic': topic,
            }

            print('Cluster # %d' %t_index + '-----------------------\n')
            print('Total items # %d' %len(l) + '-----------------------\n\n')
            print('Topic: %s' %topic+'\n')
            print(members)

        output_file.write('Total tweets being clustered is # %d' %count)

    topics_dic = {'topics': second_dic}
    with open(json_path, 'w') as json_file:
        json_file.write(json.dumps(topics_dic, default=jdefault))

    print("Total number of tweets is {}".format(count))
    print("Total number of clusters is {}".format(cluster_count))


In [17]:
#######################

In [42]:
# Run on category "aa": exp=1 (LSI built directly on the bag-of-words corpus),
# 5 topics, dup_removed=0, threshold=1.0, limit=10.
Clustering_Tweets("aa","/Users/alinamazi/Data/results/April/23-26-5th/",1,5,0,1.0,10)

Done with tokenizations. tweets count:  167
Dictionary(625 unique tokens: [u'results', u'helped', u'llc', u'captain', u'hate']...)
silhouette score is: 0.235157087445
n_cluster is 5
Cluster # 0-----------------------

Total items # 13-----------------------


Topic: flight airlines american mystery remains diverted attendant stroller crew passenger

2.08004004323 rt @ruhtyt: altercation on video grounds american airlines employee | american airlines stroller #americanairlinesstroller https://t.co/id0…
2.03107992509 rt @aunewse: illness that diverted american airlines flight remains a mystery #americanairlines #americanairlines https://t.co/dl7kxrwc7e
2.03107992509 rt @amyrightside: illness that diverted american airlines flight remains a mystery #americanairlines #americanairlines https://t.co/mr8w1uc…
1.95743908291 illness that diverted american airlines flight remains a mystery #americanairlines #americanairlines https://t.co/nhbumrchdf
1.95743908291 illness that diverted american ai

In [49]:
# NOTE(review): category "nw" has no entry in the switcher of
# read_tweets_from_file as defined in this notebook, so this call cannot resolve
# an input file on a fresh kernel; the recorded output presumably comes from an
# earlier definition — confirm before re-running.
Clustering_Tweets("nw","/home/ynh3/Documents/ExS/results/recent-tweets/Experiments_set1_json/",1,20,0)

Done with tokenizations. tweets count:  1218
Dictionary(2965 unique tokens: [u'limited', u'writings', u'child', u'yellow', u'whoopi']...)
silhouette score is: 0.0421918220818
n_cluster is 20
Cluster # 0-----------------------

Total items # 1131-----------------------


Topic: trump president russia flynn donald fake conference campaign media just

3.97022073799 @cnn shoulda read: -trump news conference  -trump news conference -trump news conference -trump news conference -tr https://t.co/zqig9ste2q
2.91029864541 4 'winner,' trump  doing lot of losing @cnn https://t.co/sac4zorxvr the only person who ever said trump was a winner was trump! loser don!
2.2062529899 rt @cnnpolitics: president donald trump calls the treatment of first lady melania trump unfair https://t.co/rezmjniurv https://t.co/y3p98hc
2.16404972452 rt @nprpolitics: president trump "i inherited a mess;" here is what america looked like when trump took office: 
2.16154671325 rt @breakingnews24u: muslim-american olympiad de

In [72]:
# Signature reminder for the positional arguments used below:
#def Clustering_Tweets(category, path, exp, cluster_size, dup_removed=0,threshold=0.8,limit=10)
# Here: exp=1, 20 topics, dup_removed=1, threshold=0.6, limit=5.
# NOTE(review): category "nw" has no entry in the switcher of
# read_tweets_from_file as defined in this notebook — confirm before re-running.
Clustering_Tweets("nw","/home/ynh3/Documents/ExS/results/recent-tweets/Experiments_set2_clusterwithincluster/",1,20,1,0.6,5)


Done with tokenizations. tweets count:  521
Dictionary(1564 unique tokens: [u'limited', u'writings', u'child', u'protest', u'aides']...)
silhouette score is: -0.119404993951
n_cluster is 20
Cluster # 0-----------------------

Total items # 481-----------------------


Topic: trump president russia flynn fake donald campaign conference russian leaks

3.75549535373 @cnn shoulda read: -trump news conference  -trump news conference -trump news conference -trump news conference -tr https://t.co/zqig9ste2q
2.68913171962 4 'winner,' trump  doing lot of losing @cnn https://t.co/sac4zorxvr the only person who ever said trump was a winner was trump! loser don!
2.25020450991 rt @cnnpolitics: president donald trump calls the treatment of first lady melania trump unfair https://t.co/rezmjniurv https://t.co/y3p98hc
2.20310111851 rt @startelegram: candidate trump loved leaks. president trump is not a big fan of them. (via @nytimes) https://t.co/muoedjezqu https://t.c
2.19825739338 rt @breakingnews24

In [12]:
# NOTE(review): this cell's execution count (12) is lower than that of the cells
# above it, and the recorded output shows clusters with fewer than 10 items and
# tweet ids in the first column, which does not match the current definition of
# Clustering_Tweets — presumably produced by an earlier version; re-run to refresh.
# Category "nw" also has no entry in the switcher of read_tweets_from_file.
Clustering_Tweets("nw","/home/ynh3/Documents/ExS/results/categorized_tweets_enhanced_1h/",1,10,0)


Done with tokenizations. tweets count:  40
Dictionary(217 unique tokens: [u'bomb', u'just', u'diplomacy', u'hands', u'mission']...)
silhouette score is: 0.276387751102
n_cluster is 10
Cluster # 0-----------------------

Total items # 4-----------------------


Topic: women arrested bomb attacks build planning laden allegedly osama idolized

583792863512670209 rt @cnnsitroom: 2 women arrested for allegedly planning to build bomb for us attacks; idolized osama bin laden http://t.co/gvwjztfxke http:
 583795510106009601 rt @cnnsitroom: 2 women were arrested &amp; accused of planning to build a bomb for attacks in the united states: http://t.co/gvwjztxyio http:/
 583794344089354241 rt @cnnbrk: 2 new york women arrested in isis-inspired bomb plot for attacks in the u.s. http://t.co/kq43qxhfsk
 583787628992028673 rt @ajam: prosecutors: two nyc women arrested on terror-related charges http://t.co/8ee1t2mu03

Cluster # 1-----------------------

Total items # 3-----------------------


Topic: re

In [102]:
# Run with exp=1, 50 topics, dup_removed=0, threshold=0.8, limit=10.
# NOTE(review): category "nw" has no entry in the switcher of
# read_tweets_from_file as defined in this notebook — confirm before re-running.
Clustering_Tweets("nw","/home/ynh3/Documents/ExS/results/categorized_tweets_and_id_3rdweek/",1,50,0,0.8,10)

Done with tokenizations. tweets count:  5137
Dictionary(8787 unique tokens: [u'raining', u'unscientific', u'yellow', u'woods', u'exorcists']...)
silhouette score is: 0.0562768392265
n_cluster is 50
Cluster # 2-----------------------

Total items # 26-----------------------


Topic: obama president list cuba terrorism white deal iran

2.08105206657 588066909586792448 obama endorses removing cuba from terrorism list: president obama intends to remove cuba from t... http://t.co/l4zkgba0hr (via @nytimes)
1.62306522841 588329812793634816 rt @bbcbreaking: us president barack obama plans to remove cuba from state sponsors of terrorism list, white house says http://t.co/rfd079v
1.50917225968 588424914434043904 rt @cnnbrk: president obama has submitted request to congress to take cuba off terror sponsor list, white house national security spokesper
1.19692889889 588057245935611904 rt @yahoonews: breaking news: president obama to remove #cuba from state terror list - via @ap #travel
1.1864422106