## 1. Preliminaries:
By: Patrick Warren (patrick.lee.warren@gmail.com)

Sections
1. Preliminaries
    1. Load packages and set some parameters
    2. Define topic and dates
2. Hydrate Tweets and Gather Them Together in a Topic (Only Need to Do This Once)
3. Define Account/Tweet Labels of Interest
    1. Transform account/tweet stats into labels
4. Pre-Specified Conversation Approach
    1. In the cross-section
    2. Over time
5. Hashtag-Based Unsupervised Approach
    1. Gather top hashtags
    2. Weekly outliers in type share
    3. Daily outliers in type share
6. NLP Unsupervised Approach
    1. Narrow dataset for NLP-- Time and language
    2. Clean up, stem, and vectorize (TF-IDF weighted bag of words)
    3. Apply k-means clustering 
    4. Apply LDA classification
    5. Visualize convo as "discovered" in k-mean/LDA
7. Hunt Graph Creation
    1. Specify targets by terms and time
    2. Gather data and build visualization of last 200 tweets

### 1.A Load Packages; Set some parameters

In [None]:
import http
from http import client
import csv
import re
import datetime
import numpy as np
import pandas as pd
import random
from time import sleep
import json

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from matplotlib.gridspec import GridSpec

%matplotlib inline

In [None]:
SMALL_SIZE = 10
MEDIUM_SIZE = 18
BIGGER_SIZE = 24

plt.rc('font', size=MEDIUM_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
#plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title


In [None]:
def get_hashtags(s):
    """Return the whitespace-separated tokens of ``s`` that start with '#'.

    Tokens are returned in their original order and with their original
    spelling (no case folding or punctuation stripping happens here).
    """
    tags = []
    for token in s.split():
        if token.startswith("#"):
            tags.append(token)
    return tags

In [None]:
#### Specific credentials for our Twitter API Access. 
# This flow will vary as a function of access/capabilities
####

import pickle
import tweepy
import os

# Pickled dict holding the four Twitter API credentials.
credentials_path = '/home/pwarren/local_trolls/secret_twitter_credentials.pkl'

if not os.path.exists(credentials_path):
    # First run: write a blank template so the keys can be filled in afterwards.
    Twitter={}
    Twitter['Consumer Key'] = ''
    Twitter['Consumer Secret'] = ''
    Twitter['Access Token'] = ''
    Twitter['Access Token Secret'] = ''
    with open(credentials_path,'wb') as f:
        pickle.dump(Twitter, f)
else:
    # NOTE(review): pickle.load can execute code embedded in the file; only load a
    # credentials file that you created yourself.
    # The context manager closes the handle, which the bare open() call never did.
    with open(credentials_path,'rb') as f:
        Twitter=pickle.load(f)


## Authorizing an application to access Twitter account data

auth = tweepy.OAuthHandler(Twitter['Consumer Key'], Twitter['Consumer Secret'])
auth.set_access_token(Twitter['Access Token'], Twitter['Access Token Secret'])


twitter_api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


### 1.B Define Topic Stem and Dates

In [None]:
# stem is short-hand for topic. Often, it is the word that we use to cast the wide net. If the topic definition is wider,
# it's short-hand for that. It is also the stem in the file structure, that we use to "sum up" over dates from the raw 
# data files.
#
# In general, our raw data files are of the form [stem]_[date].csv, and are exports from the Social Studio
# platform. 
####

stem='debates_keyhashtags'

In [None]:
### When the topic search includes several files, each of which includes data for one or more dates, this variable captures a list 
# of the dates used to aggregate the individual files

dates=['0901_0922']


## 2. Hydrate SS Tweets on  Topic by Date

In [None]:
cd /scratch2/pwarren/hunt/


In [None]:
###############
# Bring in the individual SS files, hydrate them with Twitter API (or appropriate), and save the hydrated stem/date .jsons
# 
#########

running_tweets=pd.DataFrame()

for date in dates:
    file_encoding = 'utf-8'        # set file_encoding to the file encoding (utf8, latin1, etc.)
    # Read the Social Studio export; the context manager closes the handle once it is parsed.
    with open(stem+'_'+date+'.csv', encoding=file_encoding, errors = 'backslashreplace') as input_fd:
        social_studio_tweets=pd.read_csv(input_fd, sep=',', dtype=str)

    # Unique, non-null tweet ids to look up.
    social_studio_tweets_tomatch=social_studio_tweets[social_studio_tweets['EXTERNAL_ID'].notnull()]['EXTERNAL_ID'].drop_duplicates()

    # Look the ids up in batches of 100. Every batch (including the first) is guarded,
    # and a failed batch is retried once after the pause instead of being silently
    # dropped -- dropped batches would later be indistinguishable from deleted tweets.
    api_tweets=[]
    for i in range(0, len(social_studio_tweets_tomatch), 100):
        test=social_studio_tweets_tomatch[i:i+100].tolist()
        try:
            api_tweets.extend(twitter_api.statuses_lookup(test))
        except tweepy.TweepError:
            print ('TweepError')
            sleep(30)
            try:
                api_tweets.extend(twitter_api.statuses_lookup(test))
            except tweepy.TweepError:
                print ('TweepError on retry; skipping ids at positions %d-%d' % (i, i+len(test)-1))

    # Flatten the raw API payloads into dotted columns (e.g. user.screen_name).
    json_data = [r._json for r in api_tweets]
    api_tweets_pd = pd.json_normalize(json_data)
    if 'id_str' not in api_tweets_pd.columns:
        # Nothing was hydrated; keep the merge key so the left join below still works.
        api_tweets_pd['id_str']=pd.Series(dtype=str)

    # Left join keeps every Social Studio record, hydrated or not.
    social_studio_tweets.rename(columns={'EXTERNAL_ID':'id_str'}, inplace=True)
    full_tweets=social_studio_tweets.merge(api_tweets_pd, how='left', on='id_str')
    full_tweets.to_json(stem+'_'+date+'_full.json')
   

#### Load Hydrated Data by Date, narrow to analysis columns, join narrowed data, and save it.

In [None]:
cd /scratch2/pwarren/hunt/

In [None]:
# Twitter API results are quite wide and we are only interested in a handful of the fields. This list defines that set of fields.

narrow_cols=['created_at','id_str','in_reply_to_status_id','lang','POST_TYPE','retweeted_status.id','retweeted_status.created_at','retweeted_status.user.created_at','retweeted_status.user.statuses_count','retweeted_status.user.verified','source','text','user.created_at','user.default_profile_image','user.description','user.favourites_count','user.followers_count','user.friends_count','user.geo_enabled','user.id_str','user.listed_count','user.location','user.name','user.screen_name','user.statuses_count','user.verified']


In [None]:
#Bring in the hydrated data for each date, limit to narrow columns, and append them to a master narrow topic DataFrame 

# Collect the narrowed frame for each date and concatenate once. DataFrame.append
# re-copied the accumulated frame on every iteration and is removed in pandas 2.0.
narrow_frames=[]
for date in dates:
    topic=pd.read_json(stem+'_'+date+'_full.json')
    narrow_frames.append(topic[narrow_cols])

# `topic` stays bound to the last file read, as before.
topic_narrow=pd.concat(narrow_frames) if narrow_frames else pd.DataFrame()


In [None]:
topic.head(1)

In [None]:
topic_narrow.reset_index(drop=True,inplace=True)

In [None]:
print(topic_narrow.shape)
topic_narrow.head(3)

In [None]:
#Export master narrow topic dataframe as .json, to be stored for future analysis

topic_narrow.to_json(stem+'_narrow.json')


In [None]:
topic_narrow.to_csv(stem+'_narrow.csv')

## 3. Define and Calculate Account/Tweet Labels

In [None]:
#Bring in the Narrow Data
topic_narrow=pd.read_json(stem+'_narrow.json', convert_dates=['user.created_at','created_at'],dtype={'id_str':'str','user.id_str':'str'})

In [None]:
topic_narrow['source'].value_counts().head(5)

### 3.A Transform  account/tweet metrics into labels

In [None]:
# Indicator for API getting tweet text, since Social Studio often has records, without full text of deleted tweets
fulltext_narrow=(topic_narrow['text'].notnull())

In [None]:
topic_narrow['user.created_at'].isnull().value_counts()

In [None]:
topic_narrow['birth_date']=pd.to_datetime(topic_narrow['user.created_at'], unit='ms')

In [None]:
#Calculate various derivative tweet and accounts statistics

# Tweet timestamp and the calendar fields derived from it.
topic_narrow['date_time']=pd.to_datetime(topic_narrow['created_at'], format="%Y-%b-%d %H:%M:%S+0000 ")
topic_narrow['date']=topic_narrow['date_time'].dt.normalize()
topic_narrow['month']=topic_narrow['date_time'].dt.month
# NOTE(review): Series.dt.week is deprecated in newer pandas in favour of
# dt.isocalendar().week -- confirm the installed pandas version before switching.
topic_narrow['week']=topic_narrow['date_time'].dt.week
topic_narrow['year']=topic_narrow['date_time'].dt.year
#topic['hour']=topic['date_time'].dt.hour
#topic['minute']=topic['date_time'].dt.minute
topic_narrow['second']=topic_narrow['date_time'].dt.second
# Sortable week key, e.g. 202038 for week 38 of 2020.
topic_narrow['year_week']=topic_narrow['year']*100+topic_narrow['week']

# Hashtags as they appear in the text (case preserved).
topic_narrow['hashtags']=topic_narrow['text'].astype(str).apply(get_hashtags)

# Account-level ratios that feed the labels.
topic_narrow['days_from_birth']=(topic_narrow['date']-topic_narrow['birth_date'])/np.timedelta64(1, 'D')
topic_narrow['follower_per_update']=topic_narrow['user.followers_count'].astype('float')/topic_narrow['user.statuses_count'].astype('float')
topic_narrow['follower_per_friend']=topic_narrow['user.followers_count'].astype('float')/topic_narrow['user.friends_count'].astype('float')
topic_narrow['tweets_per_day']=topic_narrow['user.statuses_count'].astype('float')/topic_narrow['days_from_birth']
# Collapse everything from the first link onward to the token 'http' so the same
# text with different short links compares equal. regex=True is explicit because
# the default for Series.str.replace changed to literal matching in pandas 2.0.
topic_narrow['short_content']=topic_narrow['text'].str.replace(r'http.*', 'http', regex=True)
topic_narrow['is_verified']=topic_narrow['user.verified']==True
topic_narrow['word_count'] = topic_narrow['text'].str.split().str.len()


In [None]:
#Identify original tweets that repeat verbatim and how many of those tweets are verified (Consider near-verbatim)

# Original (non-retweet) tweets only: those with no retweeted_status.id.
originals=topic_narrow[topic_narrow['retweeted_status.id'].isnull()]

# Per distinct short_content: how many originals share it and how many of those are from verified accounts.
repeats=(
    originals[['is_verified','id_str','short_content']]
    .groupby('short_content')
    .agg({'id_str':'count','is_verified':'sum'})
    .reset_index()
    .rename(columns={'id_str':'repeats_count','is_verified':'verified_repeat'})
)

# Attach the counts to every tweet with the same short_content.
topic_narrow=topic_narrow.merge(repeats, how='left', on='short_content')

In [None]:
topic_narrow['is_verified'].dtype

In [None]:
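## Key Tweet/Account Label Definitions:
# Deleted -- appears in SS but not in Twitter API (flagged here by a null id_str)
# Verified -- Account is verified
# Egg -- Default Profile Image
# First -- Tweet from a non-verified user whose timestamp has seconds == 1; indicator of automation.
# Flood -- Non-retweet from a non-verified user, more than 2 words long, whose text (truncated at the first link)
#          appears in more than 2 original tweets, none of them from a verified user
# Bot -- Non-verified account with more than 320 tweets per day
# Baby -- Non-verified account with 100 tweets or fewer
# Odd client -- Tweet not sent from Twitter for iPhone, Twitter for Android, Twitter for iPad, or the Twitter Web App
# Train -- Non-verified account with follower count within 10% of following count, more than 1000 friends, and more than 100 tweets
# Troll -- Non-verified account with more than 1000 friends, more than 100 but fewer than 10000 tweets, 
#          and "too many" followers per tweet (>0.75)
# Any -- Any of troll, train, baby, bot, flood, first, or deleted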


topic_narrow['deleted']=topic_narrow['id_str'].isnull()
topic_narrow['verified']=(topic_narrow['user.verified']==True)
topic_narrow['egg']=(topic_narrow['user.default_profile_image']==True)
topic_narrow['first']=((topic_narrow['user.verified']==False)
                &(topic_narrow['second']==1)
               )
topic_narrow['flood']=((topic_narrow['user.verified']==False)
                &(topic_narrow['POST_TYPE']!='RETWEET')
                &(topic_narrow['repeats_count']>2)
                 &(topic_narrow['verified_repeat']==0)
                     &(topic_narrow['word_count']>2)
               )
topic_narrow['bot']=((topic_narrow['user.verified']==False)
                &(topic_narrow['tweets_per_day']>320)
               )
topic_narrow['baby']=((topic_narrow['user.verified']==False)
            &(topic_narrow['user.statuses_count'].astype('float')<=100) 
            )
topic_narrow['odd_client']=~((topic_narrow['source'].str.contains('<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',na=False))
            |(topic_narrow['source'].str.contains('<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',na=False)) 
            |(topic_narrow['source'].str.contains('<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',na=False))
            |(topic_narrow['source'].str.contains('<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>',na=False))
            )

topic_narrow['train']=((topic_narrow['user.verified']==False) 
            &(topic_narrow['user.statuses_count'].astype('float')>100) 
            &(topic_narrow['user.friends_count'].astype('float')>1000) 
            &(topic_narrow['follower_per_friend']>.9)
             &(topic_narrow['follower_per_friend']<1.1)
            )
topic_narrow['troll']=((topic_narrow['user.verified']==False)
            &(topic_narrow['user.statuses_count'].astype('float')>100) 
            &(topic_narrow['user.friends_count'].astype('float')>1000) 
            &(topic_narrow['user.statuses_count'].astype('float')<10000) 
            &(topic_narrow['follower_per_update']>.75) 
            )

#topic_narrow['punct']=((topic_narrow['user.verified']==False)
#            &(topic_narrow['text'].str.contains(u"\u2019")) 
#                      )
    
topic_narrow['any']=topic_narrow['troll']|topic_narrow['train']|topic_narrow['baby']|topic_narrow['bot']|topic_narrow['flood']|topic_narrow['first']|topic_narrow['deleted']

In [None]:
label_list=['troll' ,'train' ,'baby' ,'bot' , 'flood','first','verified','egg','odd_client']

In [None]:
## define dictionary for groupby() aggregation in conversations approach, below.

# Per group: tweet count, first date, and the mean of each label column.
# Built without a loop variable named `type`, which would leave the builtin
# type() shadowed for the rest of the session.
group_dict={'id_str':'count', 'date':'first'}
group_dict.update({label:'mean' for label in label_list})

# 4. Pre-Specified Conversations Approach

In [None]:
#Define a set of conversations by providing a list of terms, where any tweet containing the term is in the convo
conversations=['trust','mail','home','vote']

### 4.A In the Cross-section

In [None]:
#Visualize share of tweets in each convo, and overall, with each label

# Grouped bar chart: one group per label, with one bar for the whole topic
# followed by one bar per conversation term.
means=topic_narrow[label_list].mean()
plt.figure(figsize=(20,10))
barwidth=.8/len(conversations)
label_positions=range(len(label_list))

plt.bar([x-.4-barwidth for x in label_positions], means, width=barwidth, align='center', alpha=0.5, label='Overall')

for i, convo in enumerate(conversations):
    # Case-insensitive match on the tweet text; computed once and reused for the
    # size printout and the label shares.
    convo_tweets=topic_narrow[topic_narrow['text'].str.contains(convo, na=False, case=False)]
    print(convo)
    print(convo_tweets.shape)
    means=convo_tweets[label_list].mean()

    plt.bar([x-.4 + i*barwidth for x in label_positions], means, width=barwidth, align='center', alpha=0.5, label=convo)

# Tick labels only need to be set once, not on every loop iteration.
plt.xticks(list(label_positions), label_list,  fontsize=20)
plt.legend(fontsize=20)
plt.show()



### 4.B Over Time Daily and/or Weekly

In [None]:
# Cut conversations into days, and calculate label shares for each convo x day, as well as counts.

# From here on 'date' holds plain datetime.date values.
topic_narrow['date']=topic_narrow['date_time'].dt.date
daily_cols=label_list+['date','id_str']

# Whole-topic stats per day, restricted to tweets whose text was retrieved.
topic_stats=topic_narrow[fulltext_narrow][daily_cols].groupby('date').agg(group_dict).rename(columns={'id_str':'tweet_count'})
topic_stats['convo']='Topic'

# One frame per conversation, concatenated once at the end. DataFrame.append
# re-copied the growing frame on every iteration and is removed in pandas 2.0.
daily_frames=[topic_stats]
for convo in conversations:
    convo_screen=topic_narrow['text'].str.contains(convo, na=False, case=False)
    convo_stats=topic_narrow[fulltext_narrow&convo_screen][daily_cols].groupby('date').agg(group_dict).rename(columns={'id_str':'tweet_count'})
    convo_stats['convo']=convo
    daily_frames.append(convo_stats)

daily_stats=pd.concat(daily_frames)
daily_stats['log_count']=np.log(daily_stats['tweet_count'])

In [None]:
daily_stats.shape

In [None]:
# Weekly version

# Whole-topic stats per year_week, restricted to tweets whose text was retrieved.
overall_stats=topic_narrow[fulltext_narrow][label_list+['year_week','id_str','date']].groupby('year_week').agg(group_dict).rename(columns={'id_str':'tweet_count'})
overall_stats['convo']='Overall'

# One frame per conversation, concatenated once at the end. DataFrame.append
# re-copied the growing frame on every iteration and is removed in pandas 2.0.
weekly_frames=[overall_stats]
for convo in conversations:
    convo_screen=topic_narrow['text'].str.contains(convo, na=False, case=False)
    convo_stats=topic_narrow[fulltext_narrow&convo_screen][label_list+['year_week','date','id_str']].groupby('year_week').agg(group_dict).rename(columns={'id_str':'tweet_count'})
    convo_stats['convo']=convo
    weekly_frames.append(convo_stats)

weekly_stats=pd.concat(weekly_frames)
weekly_stats['log_count']=np.log(weekly_stats['tweet_count'])
# Index by the first date seen in each week so the timelines get a date axis.
weekly_stats.index=weekly_stats['date']

In [None]:
weekly_stats.shape

In [None]:
#For axis ticks on timelines

years = mdates.YearLocator()   # every year
months = mdates.MonthLocator()  # every month
days = mdates.DayLocator()  # every day
hours = mdates.HourLocator()  # every hour
yearsFmt = mdates.DateFormatter('%Y')  # tick label format: four-digit year

In [None]:
#Daily Graphs, one frame for each account type and an overall tweet (log) count

min_tweet=10
fig, ax=plt.subplots(len(label_list)+1,1,figsize=(20,20))

convo_list=daily_stats['convo'].drop_duplicates().sort_values()
for i in range(0,len(label_list)):
        for convo in convo_list:
            ax[i].plot(daily_stats[(daily_stats['convo']==convo)&(daily_stats['tweet_count']>min_tweet)][label_list[i]].sort_index(), alpha=.7, label=convo)
        ax[i].set_title("Share "+label_list[i], fontsize=12)
        ax[i].tick_params(axis='both', which='major', labelsize=10)

        #ax[i].legend()
        #Adjust these if there are many days
        ax[i].xaxis.set_minor_locator(days)
        ax[i].xaxis.set_major_locator(months)
        ax[i].xaxis.set_major_formatter(mdates.DateFormatter('%b'))

for convo in convo_list:
    ax[len(label_list)].plot(daily_stats[daily_stats['convo']==convo]['log_count'].sort_index(), alpha=.7, label=convo)

###Adjust these if there are many days###    
ax[len(label_list)].xaxis.set_minor_locator(days)
ax[len(label_list)].xaxis.set_major_locator(months)
ax[len(label_list)].xaxis.set_major_formatter(mdates.DateFormatter('%b'))


ax[len(label_list)].tick_params(axis='both', which='major', labelsize=10)
ax[len(label_list)].set_title('Log(Tweet Count)', fontsize=12)
ax[0].legend()

fig.tight_layout(pad=.5)

#ax[i].set_ylim([0,.2])
#plt.savefig('stat_timeline.png')

In [None]:
#Weekly Version
min_tweet=30

fig, ax=plt.subplots(len(label_list)+1,1,figsize=(20,20))

convo_list=weekly_stats['convo'].drop_duplicates().sort_values()
for i in range(0,len(label_list)):
        for convo in convo_list:
            ax[i].plot(weekly_stats[(weekly_stats['convo']==convo)&(weekly_stats['tweet_count']>min_tweet)][label_list[i]].sort_index(), alpha=.7, label=convo)
        ax[i].set_title("Share "+label_list[i], fontsize=12)
       #ax[i].legend()
        ax[i].xaxis.set_major_locator(months)
        #ax[i].xaxis.set_minor_locator(months)
        ax[i].tick_params(axis='both', which='major', labelsize=10)
        ax[i].xaxis.set_major_formatter(mdates.DateFormatter('%b'))

for convo in convo_list:
    ax[len(label_list)].plot(weekly_stats[weekly_stats['convo']==convo]['log_count'].sort_index(), alpha=.7, label=convo)
ax[len(label_list)].xaxis.set_major_locator(months)
#ax[len(label_list)].xaxis.set_minor_locator(months)
ax[len(label_list)].tick_params(axis='both', which='major', labelsize=10)
ax[len(label_list)].xaxis.set_major_formatter(mdates.DateFormatter('%b'))
ax[len(label_list)].set_title('Log(Tweet Count)', fontsize=12)

fig.tight_layout(pad=.5)

ax[0].legend()
#ax[i].set_ylim([0,.2])
#plt.savefig('stat_timeline.png')

## 5. Hashtag Detection Approach 

### 5.A Gather Top Hashtags in Topic

In [None]:
#Package for counting
import collections


# Define the minimum number of times a hashtag needs to appear in a week or on a day for it to be included in the analysis

min_size_week=50
min_size_day=50

In [None]:
#regularize capitalization

topic_narrow['lower_text']=topic_narrow['text'].str.lower()

In [None]:
#Remove punctuation and gather hashtags as list

# Strip commas and periods (the only punctuation removed) before tokenizing, so
# that e.g. '#vote,' and '#vote' count as the same hashtag. The pattern is a raw
# string; the previous '\,|\.' relied on invalid escape sequences in a normal string.
topic_narrow['lower_hashtags']=topic_narrow['lower_text'].astype(str).replace(r'[,.]', '',regex=True).apply(get_hashtags)

In [None]:
#concatenate list of hashtags, for counting

hashtags = [x for l in topic_narrow[topic_narrow['lang']=='en'].lower_hashtags.values for x in l]

In [None]:
#Counter object

counter = collections.Counter(hashtags)


In [None]:
#top-X most common hashtags, overall, with counts
testtemp=counter.most_common(200)

In [None]:
#list just the hashtags, dropping a hashtag that matches the stem that defined the topic, if appropriate

long_convo=[i[0] for i in testtemp if ('#'+stem.lower()!=(str(i[0])))]

In [None]:
long_convo

In [None]:
stem.lower()

### 5.B Weekly Outliers

In [None]:
#By week, by top-200 (overall) hashtag, calculate share of tweets with each label

# One stats frame per hashtag (indexed by year_week), concatenated once at the end.
# DataFrame.append re-copied the growing frame on every iteration and is removed in pandas 2.0.
weekly_frames=[]
for convo in long_convo:
    # Tweets whose lower-cased hashtag list contains this hashtag.
    convo_screen=topic_narrow['lower_hashtags'].apply(lambda x: convo in x )
    convo_stats=topic_narrow[fulltext_narrow&convo_screen][label_list+['year_week','date','id_str']].groupby('year_week').agg(group_dict).rename(columns={'id_str':'tweet_count'})
    convo_stats['convo']=convo
    weekly_frames.append(convo_stats)

weekly_detection=pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()


In [None]:
#number of hashtag x weeks combos that clear the minimum requirement set by min_size_week

weekly_detection[weekly_detection['tweet_count']>min_size_week].shape

In [None]:
#For each week, calculate the mean/sd of shares of each label across the qualifying hashtags, as well as for the whole set of weeks

# Hashtag x week rows with enough tweets to qualify, filtered once.
qualifying_weeks=weekly_detection[weekly_detection['tweet_count']>min_size_week]
# Aggregate only the numeric columns of interest. Leaving 'convo' and 'date' in relied on
# pandas silently dropping non-numeric columns, which raises in pandas 2.0.
weekly_numeric=qualifying_weeks[label_list+['tweet_count']]

weekly_means=weekly_numeric.groupby('year_week').mean()
weekly_std=weekly_numeric.groupby('year_week').std()
overall_std=weekly_numeric.std()
overall_mean=weekly_numeric.mean()

In [None]:
#Calculate how many standard-deviations away from the overall mean each week x label share is.

for type in label_list:
    weekly_detection[type+'norm']=(weekly_detection[type]-overall_mean[type])/overall_std[type]

In [None]:
#For each label, output all the qualifying hashtags x week shares that are more than 2 sd above the mean.

for type in label_list:
    print(type)
    print(weekly_detection[(weekly_detection['tweet_count']>min_size_week)&(weekly_detection[type+'norm']>2)].sort_index()[['convo',type,'tweet_count']])
    print('')

In [None]:
print(overall_mean)
print(overall_std)

In [None]:
#collecting means and sd by week
weekly_outliers=weekly_means.merge(weekly_std, on='year_week',how='left')
    

In [None]:
#calculating the outlier thresholds by week and label
for type in label_list:
    weekly_outliers[type+"_plus_2sd"]=weekly_outliers[type+'_x']+2*weekly_outliers[type+'_y']

In [None]:
#building panel at the convo x week level, for qualifying convos
weekly_det2=weekly_detection[weekly_detection['tweet_count']>min_size_week].merge(weekly_outliers, on='year_week',how='left').reset_index()

In [None]:
weekly_det2.head(4)

In [None]:
#Output quick text file of outliers [stem].txt

# Write (and echo to the notebook) for each week the hashtags whose label share
# exceeds that week's mean + 2 sd threshold. The context manager closes the file
# even if a lookup fails part-way through.
with open(stem+".txt","w") as file1:
    for week in weekly_det2['year_week'].drop_duplicates().sort_values():
        file1.write('Week: '+str(week)+'\n')
        print('Week: '+str(week))
        # `label`, not `type`, so the builtin is not shadowed.
        for label in label_list:
            hashlist=weekly_det2[(weekly_det2[label]>weekly_det2[label+'_plus_2sd'])&(weekly_det2['year_week']==week)]['convo'].to_list()
            if hashlist:
                file1.write(label+': '+str(hashlist)+'\n')
                print(label+': '+str(hashlist))


### 5.C Daily Outliers

In [None]:
#By day, by top-200 (overall) hashtag, calculate share of tweets with each label


# One stats frame per hashtag, concatenated once at the end. DataFrame.append
# re-copied the growing frame on every iteration and is removed in pandas 2.0.
daily_frames=[]
for convo in long_convo:
    # Tweets whose lower-cased hashtag list contains this hashtag.
    convo_screen=topic_narrow['lower_hashtags'].apply(lambda x: convo in x )
    # The date index is dropped; the day survives as the 'date' column produced by group_dict.
    convo_stats=topic_narrow[fulltext_narrow&convo_screen][label_list+['date','id_str']].groupby('date').agg(group_dict).rename(columns={'id_str':'tweet_count'}).reset_index(drop=True)
    convo_stats['convo']=convo
    daily_frames.append(convo_stats)

daily_detection=pd.concat(daily_frames) if daily_frames else pd.DataFrame()


In [None]:
#For each day, calculate the mean/sd of shares of each label across the qualifying hashtags, as well as for the whole set of days


# Hashtag x day rows with enough tweets to qualify, filtered once.
qualifying_days=daily_detection[daily_detection['tweet_count']>min_size_day]
# Aggregate only the numeric columns of interest. Leaving 'convo' in relied on pandas
# silently dropping non-numeric columns, which raises in pandas 2.0.
daily_numeric_cols=label_list+['tweet_count']

daily_means=qualifying_days[daily_numeric_cols+['date']].groupby('date').mean()
daily_std=qualifying_days[daily_numeric_cols+['date']].groupby('date').std()
d_overall_std=qualifying_days[daily_numeric_cols].std()
d_overall_mean=qualifying_days[daily_numeric_cols].mean()

In [None]:
daily_means

In [None]:
#Calculate how many standard-deviations away from the overall mean each day x label share is.


for type in label_list:
    daily_detection[type+'norm']=(daily_detection[type]-d_overall_mean[type])/d_overall_std[type]

In [None]:
daily_detection[daily_detection['tweet_count']>min_size_day].shape

In [None]:
#For each label, output all the qualifying hashtags x day shares that are more than 2 sd above the mean.

for type in label_list:
    print(type)
    print(daily_detection[(daily_detection['tweet_count']>min_size_day)&(daily_detection[type+'norm']>2)].sort_index()[['convo',type,'tweet_count']])
    print('')

In [None]:
print(d_overall_mean)
print(d_overall_std)

In [None]:
#put together daily means and std. dev.
daily_outliers=daily_means.merge(daily_std, on='date',how='left')
    

In [None]:
#define what qualifies as a 2 Std.Dev. daily outlier for each label
for type in label_list:
    daily_outliers[type+"_plus_2sd"]=daily_outliers[type+'_x']+2*daily_outliers[type+'_y']

In [None]:
#Build day x convo panel for qualifying hashtags

daily_det2=daily_detection[daily_detection['tweet_count']>min_size_day].merge(daily_outliers, on='date',how='left').reset_index()

In [None]:
#Output quick text file of outliers [stem]_d.txt


# Write (and echo to the notebook) for each day the hashtags whose label share
# exceeds that day's mean + 2 sd threshold. The context manager closes the file
# even if a lookup fails part-way through.
with open(stem+"_d.txt","w") as file1:
    for day in daily_det2['date'].drop_duplicates().sort_values():
        file1.write('Date: '+str(day)+'\n')
        print('Date: '+str(day))
        # `label`, not `type`, so the builtin is not shadowed.
        for label in label_list:
            hashlist=daily_det2[(daily_det2[label]>daily_det2[label+'_plus_2sd'])&(daily_det2['date']==day)]['convo'].to_list()
            if hashlist:
                file1.write(label+': '+str(hashlist)+'\n')
                print(label+': '+str(hashlist))


## 6. Unsupervised Conversation Detection with NLTK

In [None]:
import nltk

In [None]:
topic_narrow['week'].value_counts()

### 6.A Narrow Dataset for NLP-- Time and Language

In [None]:
#NLP really needs a common language and consistent meanings. Zoom in on specific language/time. This depends on Twitter's
#Language guess. It's not great.

dataset=topic_narrow[(topic_narrow['lang']=='en')].drop_duplicates('text').copy()

In [None]:
topic_narrow['lang'].value_counts().head(5)

In [None]:
dataset.shape

### 6.B Clean, Stem, and Vectorize Text

In [None]:
def remove_links(tweet):
    '''Takes a string and removes web links from it

    Removes http(s) URLs, bare bit.ly short links and literal "[link]"
    placeholders, then replaces newlines with spaces. Returns the cleaned string.
    '''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet) # remove bitly links (dot escaped so only a literal "bit.ly" matches)
    # str.strip('[link]') treats its argument as a set of characters and ate leading and
    # trailing l/i/n/k/[/] characters from real words; remove the literal placeholder instead.
    tweet = tweet.replace('[link]', '')
    tweet = re.sub('\n', ' ',tweet) # replace newlines with spaces
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information

    Removes "RT @handle" prefixes and any remaining "@handle" mentions. A handle is
    any run of letters, digits, underscores or hyphens after the "@", so handles that
    start with a digit or underscore, or are a single character long, are removed too.
    Punctuation following the handle (e.g. the ":" after "RT @user") is left in place.
    '''
    # Raw strings: '\s' in a normal string literal is an invalid escape sequence.
    tweet = re.sub(r'(RT\s@[A-Za-z0-9_-]+)', '', tweet) # remove retweet
    tweet = re.sub(r'(@[A-Za-z0-9_-]+)', '', tweet) # remove tweeted at
    return tweet

In [None]:
dataset['clean_text']=dataset['text'].apply(remove_links)

In [None]:
dataset['clean_text']=dataset['clean_text'].apply(remove_users)

In [None]:
#Stem words from text to combine variants of a word into common stem, using NLTK's Snowball Stemmer

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
dataset['stemmed'] = dataset.clean_text.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
dataset.stemmed.head()

In [None]:
dataset.stemmed.shape

In [None]:
# Starting with the CountVectorizer/TfidfTransformer approach... to build subset of words that really stand out to use in a bag
# Options:
# min_df -- min share of tweets a word needs to show up in to be included
# max_df -- max share of tweets a word can show up in and still be included
# ngram_range(min,max) -- range of n-gram lengths to include

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

my_stop_words = text.ENGLISH_STOP_WORDS.union(["https",'amp', stem])

cvec = CountVectorizer(stop_words=my_stop_words, min_df=.001, max_df=.75, ngram_range=(1,2))
cvec

In [None]:
cvec

In [None]:
cvec_counts = cvec.fit_transform(dataset.stemmed)

print( 'sparse matrix shape:', cvec_counts.shape)
print ('nonzero count:', cvec_counts.nnz)
print ('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

In [None]:
#Top words

occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

In [None]:
# Now, weight the words for prediction where we especially focus on the words that are 'odd', in terms of showing up rarely in the overall corpus 

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

In [None]:
#List of features

feature_names = cvec.get_feature_names()
len(feature_names)

### 6.C K-Means Clustering 

In [None]:
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Consider various cluster counts, and see how well clustering works when we do mini-batches at each K
# Then, display how well the k-means fit (in terms of SSE), for each choice of k
k_max=50


K = range(1,k_max)
SSE = []
dSSE=[]
for k in K:
    kmeans = MiniBatchKMeans(n_clusters = k,batch_size = 1000)
    kmeans.fit(transformed_weights)
    SSE.append(kmeans.inertia_)
    if k>2:
        dSSE.append(SSE[k-1]-SSE[k-2])
import matplotlib.pyplot as plt
plt.plot(K,SSE,'bx-')
plt.title('Elbow Method')
plt.xlabel('cluster numbers')
plt.show()

In [None]:
#Easier to read in differences. Looking for a point where the SSE improvement stops getting better very fast.

# dSSE[j] holds SSE(k)-SSE(k-1) for k=j+3, because the fitting loop only records
# differences for k>2. Plot against that k so the x-axis really shows cluster
# numbers instead of a 0-based position.
plt.plot(range(3,3+len(dSSE)),dSSE,'bx-')
plt.title('d Elbow Method')
plt.xlabel('cluster numbers')
plt.show()

In [None]:
#Define the k we're going to use

k=10
kmeans = MiniBatchKMeans(n_clusters = k, batch_size=1000)


In [None]:
#Present the words that are closest to the center of each cluster.

kmeans.fit(transformed_weights)
centers = kmeans.cluster_centers_.argsort()[:,::-1]

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(feature_names[j])
    print(word_list)

In [None]:
#Calculate predicted cluster for each tweet in dataset

clusters_predict=kmeans.predict(transformed_weights)

In [None]:
dataset['k_cluster']=clusters_predict

In [None]:
dataset['k_cluster'].value_counts()

### 6.D Latent Dirichlet Allocation to Topics

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

#Define LDA model:
# Parameters:
# number_of_topics -- How many clusters to create (we currently use whatever looked best in the k-means)
#

number_of_topics = k

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [None]:
#Fit the TF-IDF transformed BOW model, using LDA algorithm with default parameters 
#(https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)

model.fit(transformed_weights)

In [None]:
# Function to display top features in each topic

def display_topics(model, f_names, no_top_words):
    """Tabulate the highest-weighted features of each topic in ``model``.

    For every row of ``model.components_`` two columns are produced, in this order:
    "Topic <n> words" (feature names looked up in ``f_names``) and
    "Topic <n> weights" (the matching weights formatted to one decimal place).
    Both list the ``no_top_words`` largest weights, largest first.

    Returns a pandas DataFrame with ``no_top_words`` rows.
    """
    columns = {}
    for topic_idx, weights in enumerate(model.components_):
        # Indices of the largest weights, in descending order of weight.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]

        words = []
        formatted = []
        for i in top_indices:
            words.append('{}'.format(f_names[i]))
            formatted.append('{:.1f}'.format(weights[i]))

        columns["Topic %d words" % (topic_idx)] = words
        columns["Topic %d weights" % (topic_idx)] = formatted
    return pd.DataFrame(columns)

In [None]:
#Actually do the display

no_top_words = 10
display_topics(model, feature_names, no_top_words)

In [None]:
# Create Document — Topic Matrix
lda_output = model.transform(transformed_weights)
# column names
topicnames = ["Topic" + str(i) for i in range(model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(dataset['text']))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic


In [None]:
dataset=dataset.reset_index(drop=True).join(df_document_topic['dominant_topic'].reset_index(drop=True))

In [None]:
dataset['dominant_topic'].value_counts()

### 6.E Visualize/Analyze k-Mean/LDA Clusters

In [None]:
## Merge back

merged_topic=topic_narrow.merge(dataset[['text','k_cluster','dominant_topic']], how='left',on='text')


In [None]:
merged_topic[['text','k_cluster','dominant_topic']].head(5)

In [None]:
#K-Mean Cross-Section Visualization
# Grouped bars: one group per label in label_list, one bar per k-means cluster,
# each bar showing the mean of that label among the tweets in the cluster

plt.figure(figsize=(20,10))
barwidth=.9/k
label_positions=list(range(len(label_list)))
for i in range(0,k):
    means=merged_topic[merged_topic['k_cluster']==i][label_list].mean()
    # shift each cluster by one bar width so the k bars of a label sit side by side
    plt.bar([x-.5 + i*barwidth for x in label_positions], means, width=barwidth, align='center', alpha=0.5)

# Tick labels are the same for every cluster, so set them once after the loop
plt.xticks(label_positions, label_list,  fontsize=20)

# Overall mean of each label (only needed for the optional reference bar below)
means=merged_topic[label_list].mean()

#plt.bar([x-.5 for x in range(len(label_list))], means, width=barwidth, align='center', alpha=0.5)
plt.legend(  range(0,k))
plt.show()

In [None]:
# Per-cluster summary: mean of each account/tweet label plus the number of tweets in each k-means cluster
k_summary_labels=['troll','train','baby','bot','flood','first','verified','egg']
k_summary_spec={label:'mean' for label in k_summary_labels}
k_summary_spec['id_str']='count'
k_cluster_means=(
    merged_topic[label_list+['k_cluster','id_str']]
    .groupby('k_cluster')
    .agg(k_summary_spec)
    .rename(columns={'id_str':'tweet_count'})
)

In [None]:
# k-means clusters ordered by ascending mean of the 'flood' label
k_cluster_means.sort_values('flood')

In [None]:
#LDA Cross-Section Visualization
# Grouped bars: one group per label in label_list, one bar per dominant LDA topic,
# each bar showing the mean of that label among the tweets assigned to the topic

plt.figure(figsize=(20,10))
barwidth=.9/k
label_positions=list(range(len(label_list)))
for i in range(0,k):
    means=merged_topic[merged_topic['dominant_topic']==i][label_list].mean()
    # shift each topic by one bar width so the k bars of a label sit side by side
    plt.bar([x-.5 + i*barwidth for x in label_positions], means, width=barwidth, align='center', alpha=0.5)

# Tick labels are the same for every topic, so set them once after the loop
plt.xticks(label_positions, label_list,  fontsize=20)

# Overall mean of each label (only needed for the optional reference bar below)
means=dataset[label_list].mean()

#plt.bar([x-.5 for x in range(len(label_list))], means, width=barwidth, align='center', alpha=0.5)
plt.legend(  range(0,k))
plt.show()

In [None]:
# Per-topic summary: mean of each account/tweet label plus the number of tweets in each dominant LDA topic
lda_summary_labels=['troll','train','baby','bot','flood','first','verified','egg']
lda_summary_spec={label:'mean' for label in lda_summary_labels}
lda_summary_spec['id_str']='count'
lda_means=(
    merged_topic[label_list+['dominant_topic','id_str']]
    .groupby('dominant_topic')
    .agg(lda_summary_spec)
    .rename(columns={'id_str':'tweet_count'})
)

In [None]:
# LDA topics ordered by ascending mean of the 'flood' label
lda_means.sort_values('flood')

## 7.  Hunt Graphs for Accounts from Suspicious Topics 

In [None]:
# For Wordclouds

from wordcloud import WordCloud,STOPWORDS

# Start from wordcloud's built-in stopword list
stopwords= set(STOPWORDS)

# Add Twitter boilerplate tokens on top of the defaults.
# NOTE(review): the trailing apostrophes in 'RT\'' and 'Follow\'' presumably match tokens
# that come out of the stringified hashtag lists passed to wc.generate() later -- TODO confirm
new_stopwords=stopwords.union({'RT\'','Follow\'', 'https','co','amp'})
# Shared WordCloud generator, reused for every account's hashtag cloud in the hunt graphs
wc = WordCloud(width=800, 
               height=400,
               max_font_size=100, 
               max_words=40,
               stopwords=new_stopwords,
               background_color='black',
               normalize_plurals= True,
              collocations=False)

### 7.A Specify target by terms and time 

In [None]:
#options for indicating subset of topic to target
# Every *_target below is a boolean mask over the rows of merged_topic

week_target=merged_topic['week']==33.0
day_target=merged_topic['date']=='2020-06-05'

# case-insensitive match on any of the terms; tweets with missing text never match
target_terms=['mail','absentee']
text_target=merged_topic['text'].str.contains('|'.join(target_terms),na=False, case=False)

cluster_target=merged_topic['k_cluster']==16

# flagged by 'any' and not verified
not_verified=~merged_topic['verified']
type_target=merged_topic['any']&not_verified

#aggregation of conditions to specify final target as mask of topic_narrow
target=text_target&type_target


In [None]:
### Quick glance at targeted tweets

#print(topic_narrow[target].shape)
#topic_narrow[target][['user.screen_name','text']].head()

# Five most frequent tweet texts among the targeted rows
merged_topic[target][['text']].value_counts().head(5)

### 7.B Collect Info on Targeted Accounts and Make Hunt Graphs 

In [None]:
# Collect screen_names of users in target (Maybe... change this to user id, since it's better for tracking the account)
# Only accounts with more than 100 followers are kept; drop the follower filter to keep every targeted account.

enough_followers=merged_topic['user.followers_count']>100
timing_names=(
    merged_topic.loc[target&enough_followers,'user.screen_name']
    .drop_duplicates()
    .reset_index(drop=True)
)


In [None]:
# Keep only the accounts that appear in both candidate lists (set intersection).
# This overwrites the timing_names built in the previous cell.
# NOTE(review): timing_names_1 and timing_names_2 are not assigned anywhere in this part of the
# notebook -- presumably saved copies of timing_names from two different targets; confirm they
# exist before running this cell, otherwise it raises NameError.
timing_names= pd.Series(list(set(timing_names_2) & set(timing_names_1)))

In [None]:
#print(timing_names_1.shape)
#print(timing_names_2.shape)
# Number of accounts whose timelines are requested in the next cell
print(timing_names.shape)

In [None]:
#Gather 200 most recent tweets from all accounts in target

# tweepy 4.0 renamed TweepError to TweepyException; use whichever this install provides
tweepy_error=getattr(tweepy,'TweepyException',None) or tweepy.TweepError

timing_tweets=[]
for timing_name in timing_names:
    try:
        timing_tweets.extend(twitter_api.user_timeline(screen_name=timing_name, count=200))
    except tweepy_error:
        # The timeline could not be fetched; report the account and keep going
        print('Missing: '+timing_name)

# Flatten the raw JSON of every status into one row per tweet (nested keys become dotted column names)
json_data2 = [r._json for r in timing_tweets]
timing_tweets_pd = pd.json_normalize(json_data2)

In [None]:
## Calculate stats for visualization

# Timestamp layout shared by the tweet and account creation fields
created_at_format="%a %b %d %H:%M:%S +0000 %Y"

# Tweet time, its calendar day and its hour of day (month/week/year are not needed here)
tweet_times=pd.to_datetime(timing_tweets_pd['created_at'], format=created_at_format)
timing_tweets_pd['date_time']=tweet_times
timing_tweets_pd['date']=tweet_times.dt.normalize()
timing_tweets_pd['hour']=tweet_times.dt.hour

# Hashtags found in the tweet text
timing_tweets_pd['hashtags']=timing_tweets_pd['text'].apply(get_hashtags)

# Account creation time
timing_tweets_pd['birthday']=pd.to_datetime(timing_tweets_pd['user.created_at'], format=created_at_format)

In [None]:
#collapse back down to list of accounts for iteration/account-level info presentation

account_columns=['date','birthday','user.followers_count','user.friends_count']
timing_tweets_list=(
    timing_tweets_pd.groupby('user.screen_name')[account_columns]
    .agg(
        first_date=('date','min'),
        last_date=('date','max'),
        birthday=('birthday','first'),
        following_count=('user.friends_count','first'),
        follower_count=('user.followers_count','first'),
        tweet_count=('date','count'),
    )
    .reset_index()
)

# Oldest accounts first
timing_tweets_list.sort_values('birthday', inplace=True)

# Number of calendar days covered by the collected tweets, counting both the first and the last day
collected_span=timing_tweets_list['last_date']-timing_tweets_list['first_date']
timing_tweets_list['days']=collected_span/np.timedelta64(1, 'D')+1

In [None]:
# Create distribution across hour of day and client, present hashtags, and some account state
# 100 Accounts per file
# I would love to make this a pdf with clickable links to account, but I can't figure out how 
#
# Each saved figure has one row per account:
#   left  -- histogram of tweet hour, one series per posting client ('source')
#   right -- word cloud of the account's hashtags, titled with D (days covered),
#            T (tweets collected) and F (followers)

# Bin edges for the hour-of-day histograms: 24 one-hour bins
hours=list(range(25))

# Histogram color for each known client, keyed by the raw 'source' value; anything else is grey
source_colors={
    '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>':'red',
    '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>':'skyblue',
    '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>':'orange',
    '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>':'lightgreen',
    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>':'yellow',
}

for j in range(0,len(timing_tweets_list['user.screen_name']),100):
    account_list=timing_tweets_list['user.screen_name'].iloc[j:j+100].reset_index(drop=True)
    days=timing_tweets_list['days'].iloc[j:j+100].reset_index(drop=True)
    tweets=timing_tweets_list['tweet_count'].iloc[j:j+100].reset_index(drop=True)
    fols=timing_tweets_list['follower_count'].iloc[j:j+100].reset_index(drop=True)

    # squeeze=False keeps ax two-dimensional even when the batch holds a single
    # account; otherwise ax[i,0] fails on a one-row grid
    fig, ax=plt.subplots(len(account_list),2,figsize=(10,(4*len(account_list))),squeeze=False)

    for i, account in enumerate(account_list):
        # All collected tweets of this account, filtered once and reused below
        account_tweets=timing_tweets_pd[timing_tweets_pd['user.screen_name']==account]

        for source in account_tweets['source'].drop_duplicates():
            source_hours=account_tweets[account_tweets['source']==source]['hour']
            ax[i,0].hist(source_hours, hours ,alpha=.7, label=source, color=source_colors.get(source,'grey'))
        ax[i,0].legend()
        ax[i,0].set_title(account, fontsize=12)

        # Hashtags are stored per tweet; tweets whose hashtag list prints as '[]' are skipped
        has_hashtags=account_tweets['hashtags'].astype(str)!='[]'
        hashtag_text=" ".join(account_tweets[has_hashtags]['hashtags'].astype(str))

        if hashtag_text!='':
            try:
                wc.generate(hashtag_text)
                ax[i,1].imshow(wc, interpolation="bilinear")
                ax[i,1].axis("off")
            except ValueError:
                # wc.generate could not build a cloud from this text; leave the panel empty
                print('No Hash')
        ax[i,1].set_title('D: '+str(days[i])+" T: "+str(tweets[i])+" F: "+str(fols[i]), fontsize=12)

    # One image per batch, numbered by the index of its first account
    fig.savefig(stem+'_mail_baseline_'+str(j)+'.png')
        