In [None]:
import http
from http import client
import csv
import re
import datetime
import numpy as np
from math import floor

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from matplotlib.gridspec import GridSpec
import random
%matplotlib inline

In [None]:
# Three font sizes reused by the figure defaults set here.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 10, 18, 24

# Same defaults the plt.rc(group, key=value) form sets, spelled as rcParams keys.
plt.rcParams.update({
    'font.size': MEDIUM_SIZE,        # default text size
    'axes.titlesize': BIGGER_SIZE,   # axes titles
    'axes.labelsize': MEDIUM_SIZE,   # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,   # x tick labels
    'ytick.labelsize': SMALL_SIZE,   # y tick labels
    'legend.fontsize': SMALL_SIZE,   # legend entries
})
# The figure-level title size is deliberately left at its default:
#plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title


In [None]:
# Display every available rcParams key (reference only; nothing is modified).
plt.rcParams.keys()

In [None]:
import pandas as pd


In [None]:
# Show complete cell text and up to 5000 rows/columns when a frame is displayed.
# None means "do not truncate"; the old -1 sentinel is deprecated since pandas 1.0
# and rejected by current releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)

In [None]:
def get_hashtags(s):
    """Return the whitespace-separated tokens of ``s`` that start with '#'.

    Tokens are returned in their original order and with their original case;
    a bare '#' also counts because only the first character is checked.
    """
    tags = []
    for token in s.split():
        if token.startswith("#"):
            tags.append(token)
    return tags

In [None]:
from wordcloud import WordCloud,STOPWORDS

# Start from the library's stop words and add a few Twitter-specific tokens.
# NOTE(review): the first two extras end in an apostrophe; presumably they target
# tokens that come out of stringified Python lists -- confirm.
stopwords = set(STOPWORDS)
new_stopwords = stopwords | {'RT\'','Follow\'', 'https','co','amp'}

# One reusable generator: 800x400 black canvas, at most 40 single words.
wc = WordCloud(
    stopwords=new_stopwords,
    width=800,
    height=400,
    background_color='black',
    max_words=40,
    max_font_size=100,
    normalize_plurals=True,
    collocations=False,
)

## Load Data

In [None]:
#/home/pwarren/china/hashed_2020_05_china_052020_china_052020_tweets_csv_hashed.zip


In [None]:
cd /scratch2/pwarren/accounts/

In [None]:
# File-name stem of the network being analysed, and the per-file suffixes that are
# combined with it ("<stem>_<suffix>.csv") when loading batch files.
# NOTE(review): `dates` is reassigned in a later cell before the loading loop runs.
stem='zeinaab_network'
dates=['birth_Sep27','2_birth_Oct1']

In [None]:
#cd /home/pwarren/local_trolls/Russia_releases/June_2020/
#stem='russia_052020_tweets_csv_hashed'

In [None]:
# Read the hashed tweet export directly from its zip archive.
# NOTE(review): `sandbox` is reassigned by the load cells that follow, so on a
# top-to-bottom run this frame is discarded; the path is also machine-specific.
sandbox=pd.read_csv('/home/pwarren/china/hashed_2020_05_china_052020_china_052020_tweets_csv_hashed.zip')

In [None]:
# Suffixes of the batch files to load ("<stem>_1.csv", "<stem>_2.csv").
# NOTE(review): this replaces the `dates` list defined together with `stem`.
dates=['1','2']

In [None]:
# Read one CSV per suffix in `dates` ("<stem>_<suffix>.csv") and stack them row-wise.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0, and avoids
# re-copying the growing frame on every iteration. Row labels are kept as read,
# exactly as append did.
batch_frames=[pd.read_csv(stem+'_'+date+'.csv') for date in dates]
# pd.concat refuses an empty list, so keep the old "no files -> empty frame" result.
sandbox=pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [None]:
# Load the combined export "<stem>_full.csv" from the current directory.
# NOTE(review): this overwrites whatever the earlier load cells put in `sandbox`.
sandbox=pd.read_csv(stem+'_full.csv')

In [None]:
# Tick locators/formatter for date axes, reused by the plotting cells.
# NOTE(review): a locator instance keeps a reference to the axis it was last attached
# to, so reusing one instance on several axes of the same figure is unreliable.
years = mdates.YearLocator()   # every year
months = mdates.MonthLocator()  # every month
days = mdates.DayLocator()  # every day
hours = mdates.HourLocator()  # every hour
yearsFmt = mdates.DateFormatter('%Y')

In [None]:
# (rows, columns) of the loaded frame.
sandbox.shape

In [None]:
# Per tweet: delete commas and periods, then keep the '#'-prefixed tokens.
# The other cells read the tweet body from 'tweet_text', so fall back to that column
# when the frame has no 'CONTENT' column instead of failing with a KeyError.
# The pattern is a raw string: '\,' and '\.' are invalid escapes in a normal literal.
# NOTE(review): despite the column name, nothing is lower-cased at this point.
sandbox['lower_hashtags']=(
    sandbox['CONTENT' if 'CONTENT' in sandbox.columns else 'tweet_text']
    .astype(str)
    .replace(r'\,|\.', '',regex=True)
    .apply(get_hashtags)
)

In [None]:
# Parse the minute-resolution timestamp once, then derive the midnight-normalised date.
tweet_timestamps = pd.to_datetime(sandbox['tweet_time'], format="%Y-%m-%d %H:%M")
sandbox['datetime'] = tweet_timestamps
sandbox['date'] = tweet_timestamps.dt.normalize()


In [None]:
# Tweet-type flags.
#   is_reply  : a reply target user id is present
#   is_ext_rt : retweet with no retweeted-user id recorded
#   int_rt    : retweet with a retweeted-user id recorded
# NOTE(review): "external"/"internal" presumably means outside/inside the released
# account set -- confirm against the data dictionary.
has_rt_source = sandbox['retweet_userid'].notnull()
flagged_retweet = sandbox['is_retweet']

sandbox['is_reply'] = sandbox['in_reply_to_userid'].notnull()
sandbox['is_ext_rt'] = (~has_rt_source) & (flagged_retweet)
sandbox['int_rt'] = (has_rt_source) & (flagged_retweet)


### Account-level facts, like Language and Birth

In [None]:
# Ten most frequent tweet-language tags.
sandbox['tweet_language'].value_counts().head(10)

In [None]:
# Most frequent tweet-language tag (value_counts sorts by descending count).
top_lang=sandbox['tweet_language'].value_counts().index[0]

In [None]:
# Per-tweet language indicators: tagged English, and tagged with the most common language.
tweet_lang = sandbox['tweet_language']
sandbox['is_english'] = tweet_lang.eq('en')
sandbox['is_top_lang'] = tweet_lang.eq(top_lang)


In [None]:
# Peek at one row to check the columns present so far.
sandbox.head(1)

In [None]:
# One row per account: share of English tweets, share of top-language tweets,
# earliest tweet date in the data ("birthday") and number of tweets.
account_aggregations = {
    'is_english': 'mean',
    'date': 'min',
    'is_top_lang': 'mean',
    'tweetid': 'count',
}
account_column_names = {
    'is_english': 'pct_english',
    'date': 'birthday',
    'is_top_lang': 'pct_top_lang',
    'tweetid': 'tweet_count',
}
frac_eng_birth = (
    sandbox.groupby('userid')
    .agg(account_aggregations)
    .rename(columns=account_column_names)
)

# Attach the account-level values to every tweet of that account.
sandbox = sandbox.merge(frac_eng_birth, on='userid', how='left')

In [None]:
# Age of each tweet in days, counted from the account's first tweet date.
elapsed_since_first_tweet = sandbox['date'] - sandbox['birthday']
sandbox['days_from_birth'] = elapsed_since_first_tweet / np.timedelta64(1, 'D')


In [None]:
# Distribution of tweet age (days since the account's first tweet in the data).
sandbox['days_from_birth'].hist()

In [None]:
# Distribution across accounts of the share of tweets tagged English.
frac_eng_birth['pct_english'].hist()

In [None]:
# Distribution across accounts of the share of tweets in the most common language.
frac_eng_birth['pct_top_lang'].hist()

In [None]:
# When accounts first tweeted: histogram of first-tweet dates in 100 bins.
frac_eng_birth['birthday'].hist(bins=100)


In [None]:
# Index rows by tweet timestamp; the resample() calls in the plotting cells group on this index.
sandbox.index=sandbox['datetime']

### Creation and Output over Time

In [None]:
# Six histograms of each account's first tweet date ("birthday"): all accounts,
# then subsets by language mix and by tweet volume.
fig, ax=plt.subplots(3,2,figsize=(15,10))

# Roughly one bin per ten accounts, but never zero: floor(n/10) is 0 for a network
# with fewer than ten accounts and hist() rejects bins=0.
n_bins=max(1, floor(len(frac_eng_birth['birthday'])/10))

# (axis, accounts to include, bar colour, panel title)
panels=[
    (ax[0,0], frac_eng_birth,
     'black', 'First Tweet Date'),
    (ax[0,1], frac_eng_birth[frac_eng_birth['pct_top_lang']>.6],
     'red', 'First Tweet Date, Top Language Accounts'),
    (ax[1,0], frac_eng_birth[frac_eng_birth['pct_english']>.2],
     'blue', 'First Tweet Date, English Accounts'),
    (ax[1,1], frac_eng_birth[(frac_eng_birth['pct_english']<.2)&(frac_eng_birth['pct_top_lang']<.6)],
     'green', 'First Tweet Date, Other Language Accounts'),
    (ax[2,0], frac_eng_birth[(frac_eng_birth['tweet_count']<100)],
     'lightgrey', 'First Tweet Date, <100 Tweet Accounts'),
    (ax[2,1], frac_eng_birth[(frac_eng_birth['tweet_count']>1000)],
     'grey', 'First Tweet Date, >1000 Tweet Accounts'),
]

for panel_ax, accounts, bar_color, panel_title in panels:
    panel_ax.hist(accounts['birthday'], bins=n_bins, color=bar_color)
    panel_ax.set_title(panel_title, fontsize=14)
    # A locator stores the axis it is attached to, so every panel gets its own
    # instances; sharing one instance would tie all panels to the last axis.
    panel_ax.xaxis.set_major_locator(mdates.YearLocator())
    panel_ax.xaxis.set_minor_locator(mdates.MonthLocator())
    panel_ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))


In [None]:
# Weekly tweet counts for three slices: tweets tagged English, tweets in the most
# common language, and all tweets from accounts that are more than 20% English.
# resample('W') relies on the frame's datetime index.
count_columns=['tweetid','tweet_language']
count_tweets={'tweetid':'count'}
graphtemp_en=sandbox.loc[sandbox['tweet_language']=='en', count_columns].resample('W').agg(count_tweets)
graphtemp_zh=sandbox.loc[sandbox['tweet_language']==top_lang, count_columns].resample('W').agg(count_tweets)
graphtemp_und=sandbox.loc[sandbox['pct_english']>.2, count_columns].resample('W').agg(count_tweets)

fig, ax=plt.subplots(figsize=(15,6))
for weekly_series, line_color in ((graphtemp_en,'blue'), (graphtemp_zh,'red'), (graphtemp_und,'grey')):
    ax.plot(weekly_series, alpha=.7, color=line_color)

# Month numbers as minor ticks, with the year on a second line at each major tick.
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n%Y'))
ax.xaxis.set_minor_locator(months)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))

ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(['English',top_lang, '>20% Eng'], fontsize='medium')
ax.set_title('Total Weekly Output')
plt.show()

In [None]:
# Weekly count of English-tagged tweets only (resample relies on the datetime index).
graphtemp_en=sandbox[sandbox['tweet_language']=='en'][['tweetid','tweet_language']].resample('W').agg({'tweetid':'count'})


fig, ax=plt.subplots(figsize=(15,6))

ax.plot(graphtemp_en,alpha=.7,color='blue')


ax.xaxis.set_major_locator(years)
ax.xaxis.set_minor_locator(months)

# Month numbers on the minor ticks, year on a second line at the major ticks.
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n%Y'))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))


ax.tick_params(axis='both', which='major', labelsize=20)
# Exactly one label for the one plotted line; the former three-item list named
# series (including a hard-coded 'tr') that are not drawn in this figure.
plt.legend(['English'], fontsize='medium')
plt.title('Total Weekly Output')
plt.show()

In [None]:
# Daily tweet counts in the most common language, limited to tweets after 2019-01-01.
# Only this one series is drawn; resample('D') relies on the frame's datetime index.
recent_top_lang=(sandbox['tweet_language']==top_lang)&(sandbox['datetime']>'2019-01-01')
graphtemp_zh=sandbox.loc[recent_top_lang, ['tweetid','tweet_language']].resample('D').agg({'tweetid':'count'})

fig, ax=plt.subplots(figsize=(15,6))
ax.plot(graphtemp_zh, alpha=.7, color='red')

# Month numbers as minor ticks, with the year on a second line at each major tick.
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n%Y'))
ax.xaxis.set_minor_locator(months)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))

ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend([top_lang], fontsize='medium')
ax.set_title('Total Daily Output- Zoom in ')
plt.show()

In [None]:
# Daily count of English-tagged tweets after 2019-01-01 (resample relies on the datetime index).
graphtemp_en=sandbox[(sandbox['tweet_language']=='en')&(sandbox['datetime']>'2019-01-01')][['tweetid','tweet_language']].resample('D').agg({'tweetid':'count'})


fig, ax=plt.subplots(figsize=(15,6))

ax.plot(graphtemp_en,alpha=.7,color='blue')


ax.xaxis.set_major_locator(years)
ax.xaxis.set_minor_locator(months)

# Month numbers on the minor ticks, year on a second line at the major ticks.
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n%Y'))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))


ax.tick_params(axis='both', which='major', labelsize=20)
# Exactly one label for the one plotted line; the former three-item list named
# series (including a hard-coded 'tr') that are not drawn in this figure.
plt.legend(['English'], fontsize='medium')
plt.title('Total Daily Output- Zoom in ')
plt.show()

In [None]:
# Weekly share of tweets flagged int_rt, for English-tagged tweets, top-language
# tweets, and tweets from accounts that are more than 20% English.
share_columns=['int_rt','tweet_language']
weekly_mean={'int_rt':'mean'}
graphtemp_en=sandbox.loc[sandbox['tweet_language']=='en', share_columns].resample('W').agg(weekly_mean)
graphtemp_zh=sandbox.loc[sandbox['tweet_language']==top_lang, share_columns].resample('W').agg(weekly_mean)
graphtemp_und=sandbox.loc[(sandbox['pct_english']>.2), share_columns].resample('W').agg(weekly_mean)

fig, ax=plt.subplots(figsize=(15,6))
for weekly_series, line_color in ((graphtemp_en,'blue'), (graphtemp_zh,'red'), (graphtemp_und,'grey')):
    ax.plot(weekly_series, alpha=.7, color=line_color)

# Month numbers as minor ticks, with the year on a second line at each major tick.
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n%Y'))
ax.xaxis.set_minor_locator(months)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))

ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(['English',top_lang, '>20% Eng'], fontsize='medium')
ax.set_title('Internal RT')
plt.show()

In [None]:
# Weekly share of tweets that are replies, for English-tagged tweets, top-language
# tweets, and tweets from accounts that are more than 20% English.
share_columns=['is_reply','tweet_language']
weekly_mean={'is_reply':'mean'}
graphtemp_en=sandbox.loc[sandbox['tweet_language']=='en', share_columns].resample('W').agg(weekly_mean)
graphtemp_zh=sandbox.loc[sandbox['tweet_language']==top_lang, share_columns].resample('W').agg(weekly_mean)
graphtemp_und=sandbox.loc[(sandbox['pct_english']>.2), share_columns].resample('W').agg(weekly_mean)

fig, ax=plt.subplots(figsize=(15,6))
for weekly_series, line_color in ((graphtemp_en,'blue'), (graphtemp_zh,'red'), (graphtemp_und,'grey')):
    ax.plot(weekly_series, alpha=.7, color=line_color)

# Month numbers as minor ticks, with the year on a second line at each major tick.
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n%Y'))
ax.xaxis.set_minor_locator(months)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))

ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(['English',top_lang, '>20%'], fontsize='medium')
ax.set_title('Reply Share')
plt.show()

In [None]:
# Weekly share of tweets flagged is_ext_rt, for English-tagged tweets, top-language
# tweets, and tweets from accounts that are more than 20% English.
share_columns=['is_ext_rt','tweet_language']
weekly_mean={'is_ext_rt':'mean'}
graphtemp_en=sandbox.loc[sandbox['tweet_language']=='en', share_columns].resample('W').agg(weekly_mean)
graphtemp_zh=sandbox.loc[sandbox['tweet_language']==top_lang, share_columns].resample('W').agg(weekly_mean)
graphtemp_und=sandbox.loc[(sandbox['pct_english']>.2), share_columns].resample('W').agg(weekly_mean)

fig, ax=plt.subplots(figsize=(15,6))
for weekly_series, line_color in ((graphtemp_en,'blue'), (graphtemp_zh,'red'), (graphtemp_und,'grey')):
    ax.plot(weekly_series, alpha=.7, color=line_color)

# Month numbers as minor ticks, with the year on a second line at each major tick.
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(mdates.DateFormatter('\n%Y'))
ax.xaxis.set_minor_locator(months)
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))

ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(['English',top_lang, '>20%'], fontsize='medium')
ax.set_title('(Maybe) External Retweet Share')
plt.show()

In [None]:
# Column names currently present in the frame.
sandbox.columns

In [None]:
# Account-language breakdown for tweets whose language tag is 'und'.
sandbox[sandbox['tweet_language']=='und']['account_language'].value_counts()

In [None]:
from wordcloud import WordCloud,STOPWORDS

stopwords= set(STOPWORDS)

# Extra stop words. The apostrophe-suffixed entries are kept for the clouds that are
# built from stringified list columns; the bare 'RT' / 'Follow' cover the same words
# now that this cloud is fed clean tags.
new_stopwords=stopwords.union({'RT\'','Follow\'','RT','Follow', 'https','co','amp'})
wc = WordCloud(width=800, 
               height=400,
               max_font_size=100, 
               max_words=40,
               stopwords=new_stopwords,
               background_color='black',
               normalize_plurals= True,
              collocations=False)

# Each row of 'lower_hashtags' is a list of tags. Join the tags themselves: calling
# str() on the lists would mix brackets and quote characters into the text, and a
# trailing quote can stay attached to the words drawn in the cloud.
Hashtag_Combined = " ".join(tag for tags in sandbox['lower_hashtags'] for tag in tags)

wc.generate(Hashtag_Combined)
plt.figure(figsize=(15,15))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title('Dominant Hashtags')

#Show the wordcloud
plt.show()

In [None]:
# Word cloud of the 'hashtags' column for tweets tagged English.
english_hashtags = sandbox.loc[sandbox['tweet_language']=='en', 'hashtags']
Hashtag_Combined = " ".join(english_hashtags.astype(str))
wc.generate(Hashtag_Combined)

# Draw the generated cloud on a square figure without axes.
fig, ax = plt.subplots(figsize=(15,15))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
ax.set_title('Dominant Hashtags: Tweets Tagged English')
plt.show()

In [None]:
# Word cloud of the 'hashtags' column for tweets in the most common language.
top_lang_hashtags = sandbox.loc[sandbox['tweet_language']==top_lang, 'hashtags']
Hashtag_Combined = " ".join(top_lang_hashtags.astype(str))
wc.generate(Hashtag_Combined)

# Draw the generated cloud on a square figure without axes.
fig, ax = plt.subplots(figsize=(15,15))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
ax.set_title('Dominant Hashtags: Tweets Tagged '+top_lang)
plt.show()

In [None]:
# Word cloud of the 'hashtags' column for tweets from accounts that are more than 20% English.
english_account_hashtags = sandbox.loc[sandbox['pct_english']>.2, 'hashtags']
Hashtag_Combined = " ".join(english_account_hashtags.astype(str))
wc.generate(Hashtag_Combined)

# Draw the generated cloud on a square figure without axes.
fig, ax = plt.subplots(figsize=(15,15))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
ax.set_title('Dominant Hashtags: Accounts with >20% Tweets Tagged English')
plt.show()

## Hour and Day of Week

In [None]:
# Day of week (Monday=0) and hour of day taken from each tweet's timestamp.
tweet_time_parts = sandbox['datetime'].dt
sandbox['weekday'] = tweet_time_parts.weekday
sandbox['hour'] = tweet_time_parts.hour

In [None]:
# Number of English-tagged tweets per day of week (0 = Monday).
graphtemp_en=sandbox[sandbox['tweet_language']=='en'][['tweetid','tweet_language','weekday']].groupby('weekday').count()['tweetid']


fig, ax=plt.subplots(figsize=(15,6))

plt.plot(graphtemp_en,alpha=.7,color='black')


ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)


plt.title('English Output by Day of Week , Monday=0')
# Only the English series is drawn, so the legend carries a single label
# (the former list also named 'zh' and '>20% en', which are not plotted here).
plt.legend(['en'])
plt.show()

In [None]:
# Number of tweets in the most common language per day of week (0 = Monday).
graphtemp_zh=sandbox[sandbox['tweet_language']==top_lang][['tweetid','tweet_language','weekday']].groupby('weekday').count()['tweetid']


fig, ax=plt.subplots(figsize=(15,6))

plt.plot(graphtemp_zh,alpha=.7,color='red')


ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)


plt.title(top_lang+' Output by Day of Week , Monday=0')
# Label the line with the language actually plotted instead of a hard-coded 'tr'.
plt.legend([top_lang])
plt.show()

In [None]:
# Number of English-tagged tweets per hour of day.
english_tweets = sandbox.loc[sandbox['tweet_language']=='en']
graphtemp_en = english_tweets.groupby('hour')['tweetid'].count()

fig, ax = plt.subplots(figsize=(15,6))
ax.plot(graphtemp_en, alpha=.7, color='black')

ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)

ax.set_title(' English Output by Hour of Day , UTC')
plt.show()

In [None]:

# Number of tweets in the most common language per hour of day.
top_lang_tweets = sandbox.loc[sandbox['tweet_language']==top_lang]
graphtemp_zh = top_lang_tweets.groupby('hour')['tweetid'].count()

fig, ax = plt.subplots(figsize=(15,6))
ax.plot(graphtemp_zh, alpha=.7, color='red')

ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)

ax.set_title(top_lang+' Output by Hour of Day , UTC')
plt.show()

In [None]:
# Column names currently present in the frame.
sandbox.columns

In [None]:
# Ten most used posting clients among tweets tagged English.
sandbox[sandbox['tweet_language']=='en']['tweet_client_name'].value_counts().head(10)

In [None]:
# Ten most used posting clients among tweets in the most common language.
sandbox[sandbox['tweet_language']==top_lang]['tweet_client_name'].value_counts().head(10)

In [None]:
# Ten most used posting clients among tweets whose language tag is 'und'.
sandbox[sandbox['tweet_language']=='und']['tweet_client_name'].value_counts().head(10)

In [None]:
# Rank -> client name for the (up to) four most used clients among top-language tweets.
# value_counts() is computed once and sliced, so a language with fewer than four
# distinct clients yields a shorter dict instead of raising IndexError.
lang_top_client=dict(enumerate(
    sandbox.loc[sandbox['tweet_language']==top_lang, 'tweet_client_name'].value_counts().index[:4]
))


In [None]:
# Rank -> client name for the (up to) four most used clients among English-tagged tweets.
# value_counts() is computed once and sliced, so fewer than four distinct clients
# yields a shorter dict instead of raising IndexError.
eng_top_client=dict(enumerate(
    sandbox.loc[sandbox['tweet_language']=='en', 'tweet_client_name'].value_counts().index[:4]
))


In [None]:
# Most frequent tweet-language tag (same expression that defines top_lang).
sandbox['tweet_language'].value_counts().index[0]

In [None]:
# Hourly tweet counts for English-tagged tweets, one line per top English client.
english_rows=(sandbox['tweet_language']=='en')
graphtemp={
    i: sandbox.loc[english_rows&(sandbox['tweet_client_name']==client)].groupby('hour')['tweetid'].count()
    for i, client in eng_top_client.items()
}

fig, ax=plt.subplots(figsize=(15,6))
for i in graphtemp:
    ax.plot(graphtemp[i],alpha=.7)

ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)

# Lines were added in rank order, so the client names line up with them.
ax.legend(eng_top_client.values())
# Title spelling corrected ("Langauge" -> "Language").
plt.title('Client Usage, English Language Output by Hour of Day , UTC')

plt.show()

In [None]:
# Hourly tweet counts for top-language tweets, one line per top client in that language.
top_lang_rows=(sandbox['tweet_language']==top_lang)
graphtemp={
    i: sandbox.loc[top_lang_rows&(sandbox['tweet_client_name']==client)].groupby('hour')['tweetid'].count()
    for i, client in lang_top_client.items()
}

fig, ax=plt.subplots(figsize=(15,6))
for i in graphtemp:
    ax.plot(graphtemp[i],alpha=.7)

ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)

# Lines were added in rank order, so the client names line up with them.
ax.legend(lang_top_client.values())
# Title spelling corrected ("Langauge" -> "Language").
plt.title('Client Usage, Top Language Output by Hour of Day , UTC')

plt.show()

### Lifecycle

In [None]:
# Share of tweets flagged int_rt by account age, over the first 200 days after an
# account's first tweet, for the three usual slices.
young_account=(sandbox['days_from_birth']<200)
lifecycle_columns=['int_rt','tweet_language','days_from_birth']
mean_by_age={'int_rt':'mean'}
graphtemp_en=sandbox.loc[(sandbox['tweet_language']=='en')&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)
graphtemp_zh=sandbox.loc[(sandbox['tweet_language']==top_lang)&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)
graphtemp_und=sandbox.loc[(sandbox['pct_english']>.2)&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)

fig, ax=plt.subplots(figsize=(15,6))
for age_series, line_color in ((graphtemp_en,'blue'), (graphtemp_zh,'red'), (graphtemp_und,'grey')):
    ax.plot(age_series, alpha=.7, color=line_color)

# The x axis is a day count here, so no date locators/formatters are applied.
ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(['English',top_lang, '>20% Eng'], fontsize='medium')
ax.set_title('Internal RT')
plt.show()

In [None]:
# Share of tweets flagged is_ext_rt by account age, over the first 200 days after an
# account's first tweet, for the three usual slices.
young_account=(sandbox['days_from_birth']<200)
lifecycle_columns=['is_ext_rt','tweet_language','days_from_birth']
mean_by_age={'is_ext_rt':'mean'}
graphtemp_en=sandbox.loc[(sandbox['tweet_language']=='en')&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)
graphtemp_zh=sandbox.loc[(sandbox['tweet_language']==top_lang)&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)
graphtemp_und=sandbox.loc[(sandbox['pct_english']>.2)&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)

fig, ax=plt.subplots(figsize=(15,6))
for age_series, line_color in ((graphtemp_en,'blue'), (graphtemp_zh,'red'), (graphtemp_und,'grey')):
    ax.plot(age_series, alpha=.7, color=line_color)

# The x axis is a day count here, so no date locators/formatters are applied.
ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(['English',top_lang, '>20% Eng'], fontsize='medium')
ax.set_title('External RT')
plt.show()

In [None]:
# Share of tweets that are replies by account age, over the first 200 days after an
# account's first tweet, for the three usual slices.
young_account=(sandbox['days_from_birth']<200)
lifecycle_columns=['is_reply','tweet_language','days_from_birth']
mean_by_age={'is_reply':'mean'}
graphtemp_en=sandbox.loc[(sandbox['tweet_language']=='en')&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)
graphtemp_zh=sandbox.loc[(sandbox['tweet_language']==top_lang)&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)
graphtemp_und=sandbox.loc[(sandbox['pct_english']>.2)&young_account, lifecycle_columns].groupby('days_from_birth').agg(mean_by_age)

fig, ax=plt.subplots(figsize=(15,6))
for age_series, line_color in ((graphtemp_en,'blue'), (graphtemp_zh,'red'), (graphtemp_und,'grey')):
    ax.plot(age_series, alpha=.7, color=line_color)

# The x axis is a day count here, so no date locators/formatters are applied.
ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(['English',top_lang, '>20% Eng'], fontsize='medium')
ax.set_title('Reply')
plt.show()

### Our Types

In [None]:
# Per-tweet features consumed by the label definitions and the weekly grouping.
sandbox['second']=sandbox['datetime'].dt.second

# Fractional days between the tweet and midnight of the account's first tweet date.
# NOTE(review): this replaces any whole-day 'days_from_birth' computed earlier.
sandbox['days_from_birth']=(sandbox['datetime']-sandbox['birthday'])/np.timedelta64(1, 'D')
sandbox['follower_per_update']=sandbox['follower_count'].astype('float')/sandbox['tweet_count'].astype('float')
sandbox['follower_per_friend']=sandbox['follower_count'].astype('float')/sandbox['following_count'].astype('float')
# NOTE(review): divides the account's tweet count by this tweet's age, so values
# are very large (or inf) for tweets close to the first one -- confirm this is intended.
sandbox['tweets_per_day']=sandbox['tweet_count'].astype('float')/sandbox['days_from_birth']
# Collapse everything from the first 'http' onward so tweets differing only in their
# link compare equal. regex=True is required: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text unchanged.
sandbox['short_content']=sandbox['tweet_text'].str.replace(r'http.*', 'http', regex=True)
sandbox['word_count'] = sandbox['tweet_text'].str.split().str.len()

# ISO calendar fields (Series.dt.week was removed in pandas 2.0; isocalendar needs >= 1.1).
iso_calendar=sandbox['datetime'].dt.isocalendar()
sandbox['week']=iso_calendar['week']
sandbox['year']=sandbox['datetime'].dt.year

# Pair the ISO week with the ISO year. With the calendar year, days such as
# 2019-12-30 (ISO week 1 of 2020) were keyed 201901 and merged into January 2019.
sandbox['year_week']=iso_calendar['year']*100+iso_calendar['week']

In [None]:
# Tally of the seconds component of the tweet timestamps.
# NOTE(review): if 'datetime' was parsed with a minute-resolution format, expect only 0 here -- confirm.
sandbox['second'].value_counts()

In [None]:
# For every distinct shortened text among non-retweets, count how many tweets carry it,
# then attach that count to each row as 'repeats_count' (NaN where there is no match).
original_tweets = sandbox.loc[sandbox['is_retweet']==False, ['tweetid','short_content']]
repeats = (
    original_tweets
    .groupby('short_content')
    .agg({'tweetid':'count'})
    .reset_index()
    .rename(columns={'tweetid':'repeats_count'})
)
# NOTE(review): a column-on-column merge returns a fresh integer index, so the frame
# is no longer indexed by datetime after this line.
sandbox = sandbox.merge(repeats, how='left', on='short_content')

In [None]:
# Peek at one row to check the newly added columns.
sandbox.head(1)

In [None]:
# Most common profile descriptions, counting missing values as their own category.
sandbox['user_profile_description'].value_counts(dropna=False).head()

In [None]:
# Distribution of the following_count column (one value per tweet row).
sandbox['following_count'].hist()

In [None]:
## Key Tweet/Account Label Definitions (as implemented below):
# egg      -- profile description and reported location are both missing
# flood    -- not a retweet, more than 2 words, and repeats_count above 2
# repeater -- more retweets than likes, and more than 5 retweets
# bot      -- tweets_per_day above 320
# baby     -- tweet_count of 100 or fewer
# train    -- more than 100 tweets, following more than 1000 accounts, and a
#             follower/following ratio strictly between 0.9 and 1.1
# troll    -- more than 100 and fewer than 10000 tweets, following more than 500
#             accounts, and more than 0.75 followers per tweet
# any      -- at least one of the labels above
# ("first", a tweet posted in a given second of the minute, is not computed.)

# Float views of the two count columns that several labels share.
tweet_total=sandbox['tweet_count'].astype('float')
following_total=sandbox['following_count'].astype('float')
follow_ratio=sandbox['follower_per_friend']

sandbox['egg']=sandbox['user_profile_description'].isnull() & sandbox['user_reported_location'].isnull()

sandbox['flood']=(
    (sandbox['is_retweet']==False)
    & (sandbox['repeats_count']>2)
    & (sandbox['word_count']>2)
)

sandbox['repeater']=(sandbox['retweet_count']>sandbox['like_count']) & (sandbox['retweet_count']>5)

sandbox['bot']=sandbox['tweets_per_day']>320

sandbox['baby']=(tweet_total<=100)

sandbox['train']=(
    (tweet_total>100)
    & (following_total>1000)
    & (follow_ratio>.9)
    & (follow_ratio<1.1)
)

sandbox['troll']=(
    (tweet_total>100)
    & (following_total>500)
    & (tweet_total<10000)
    & (sandbox['follower_per_update']>.75)
)

# Order used for the stacked "share" panels.
label_list=['troll' ,'train' ,'baby' ,'bot' , 'flood','egg','any','repeater']

sandbox['any']=(
    sandbox['egg'] | sandbox['flood'] | sandbox['bot'] | sandbox['baby']
    | sandbox['train'] | sandbox['troll'] | sandbox['repeater']
)

In [None]:
# Aggregation spec for the weekly tables: number of tweets, first date seen in the
# group, and the mean (i.e. share) of every boolean label.
group_dict={'tweetid':'count','date':'first'}
# A comprehension keeps the loop variable local; the former `for type in ...` loop
# rebound the builtin `type` for the rest of the kernel session.
group_dict.update({label:'mean' for label in label_list})

In [None]:
# One row per year_week: tweet count, first date in the week and the share of each label.
weekly_columns = label_list + ['year_week','tweetid','date']
weekly_stats = (
    sandbox[weekly_columns]
    .groupby('year_week')
    .agg(group_dict)
    .rename(columns={'tweetid':'tweet_count'})
)
weekly_stats['log_count'] = np.log10(weekly_stats['tweet_count'])
# Index by the week's first date so the rows can be plotted on a date axis.
weekly_stats.index = weekly_stats['date']

In [None]:
#Weekly Version
# Weeks with min_tweet tweets or fewer are left out of the share panels.
min_tweet=30

# One panel per label plus a final panel with log10 of the weekly tweet count.
fig, ax=plt.subplots(len(label_list)+1,1,figsize=(20,20))

busy_weeks=weekly_stats[(weekly_stats['tweet_count']>min_tweet)]
for i in range(0,len(label_list)):
    ax[i].plot(busy_weeks[label_list[i]].sort_index(), alpha=.7, label='Overall')
    ax[i].set_title("Share "+label_list[i], fontsize=12)

ax[len(label_list)].plot(weekly_stats['log_count'].sort_index(), alpha=.7, label='Overall')
ax[len(label_list)].set_title('Log10(Tweet Count)', fontsize=12)

for panel_ax in ax:
    # A locator stores the axis it is attached to, so each panel gets its own
    # instances; one shared instance would tie every panel to the last axis.
    panel_ax.xaxis.set_major_locator(mdates.YearLocator())
    panel_ax.xaxis.set_minor_locator(mdates.MonthLocator())
    panel_ax.tick_params(axis='both', which='major', labelsize=10)
    panel_ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax[0].legend()

fig.tight_layout(pad=.5)
#plt.savefig('stat_timeline.png')

In [None]:
# Same weekly table, restricted to tweets from accounts that are more than 20% English.
weekly_columns = label_list + ['year_week','tweetid','date']
weekly_stats_eng = (
    sandbox.loc[sandbox['pct_english']>.2, weekly_columns]
    .groupby('year_week')
    .agg(group_dict)
    .rename(columns={'tweetid':'tweet_count'})
)
weekly_stats_eng['log_count'] = np.log10(weekly_stats_eng['tweet_count'])
# Index by the week's first date so the rows can be plotted on a date axis.
weekly_stats_eng.index = weekly_stats_eng['date']

In [None]:
#Weekly Version
# Weeks with min_tweet tweets or fewer are left out of the share panels.
min_tweet=30

# One panel per label plus a final panel with log10 of the weekly tweet count;
# each panel compares the >20%-English accounts with the full data set.
fig, ax=plt.subplots(len(label_list)+1,1,figsize=(20,20))

busy_weeks_eng=weekly_stats_eng[(weekly_stats_eng['tweet_count']>min_tweet)]
busy_weeks_full=weekly_stats[(weekly_stats['tweet_count']>min_tweet)]
for i in range(0,len(label_list)):
    # Labels name the series actually drawn (both used to be tagged 'Overall').
    ax[i].plot(busy_weeks_eng[label_list[i]].sort_index(), alpha=.7, label='English')
    ax[i].plot(busy_weeks_full[label_list[i]].sort_index(), alpha=.7, label='Full')
    ax[i].set_title("Share "+label_list[i], fontsize=12)

ax[len(label_list)].plot(weekly_stats_eng['log_count'].sort_index(), alpha=.7, label='English')
ax[len(label_list)].plot(weekly_stats['log_count'].sort_index(), alpha=.7, label='Full')
ax[len(label_list)].set_title('Log10(Tweet Count)', fontsize=12)

for panel_ax in ax:
    # A locator stores the axis it is attached to, so each panel gets its own
    # instances; one shared instance would tie every panel to the last axis.
    panel_ax.xaxis.set_major_locator(mdates.YearLocator())
    panel_ax.xaxis.set_minor_locator(mdates.MonthLocator())
    panel_ax.tick_params(axis='both', which='major', labelsize=10)
    panel_ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax[0].legend(['English','Full'])

fig.tight_layout(pad=.5)
#plt.savefig('stat_timeline.png')

In [None]:
#Package for counting
import collections


# Minimum number of times a hashtag must appear -- in the overall topic (min_size_week)
# or on a single day (min_size_day) -- for it to be included in the analysis.
# NOTE(review): neither threshold is read by the cells visible in this part of the notebook.

min_size_week=50
min_size_day=50

In [None]:
# Lower-cased copy of the tweet text.
sandbox['lower_text']=sandbox['tweet_text'].str.lower()

In [None]:
# Rebuild the hashtag lists from the lower-cased text: delete commas and periods,
# then keep the '#'-prefixed tokens. The pattern is a raw string because '\,' and
# '\.' are invalid escape sequences in a normal literal (a warning on current Python).
sandbox['lower_hashtags']=sandbox['lower_text'].astype(str).replace(r'\,|\.', '',regex=True).apply(get_hashtags)

In [None]:
# Flatten the per-tweet hashtag lists into one list and count each tag.
hashtags = []
for tweet_tags in sandbox.lower_hashtags.values:
    hashtags.extend(tweet_tags)

counter = collections.Counter(hashtags)


In [None]:
# The 200 most frequent hashtags as (tag, count) pairs, and the same tags in rank
# order with the hashtag equal to '#' + lower-cased stem left out.
testtemp = counter.most_common(200)
stem_hashtag = '#' + stem.lower()
long_convo = [tag for tag, _ in testtemp if str(tag) != stem_hashtag]

In [None]:
# Show the (hashtag, count) pairs for the most frequent hashtags.
testtemp

In [None]:
# Track the five most frequent remaining hashtags as separate "conversations".
conversations=long_convo[0:5]

In [None]:
# Weekly label shares for the whole data set ('Overall') and for each tracked hashtag.
overall_stats=sandbox[label_list+['year_week','tweetid','date']].groupby('year_week').agg(group_dict).rename(columns={'tweetid':'tweet_count'})
overall_stats['convo']='Overall'

# Collect the per-conversation tables and stack them once: DataFrame.append was
# removed in pandas 2.0 and re-copied the growing frame on every pass.
stats_frames=[overall_stats]
for convo in conversations:
    # regex=False: the hashtag is matched as literal text, so characters such as
    # '?', '(' or '+' inside a tag are not interpreted as regex syntax.
    convo_screen=sandbox['tweet_text'].str.contains(convo, na=False, case=False, regex=False)
    convo_stats=sandbox[convo_screen][label_list+['year_week','date','tweetid']].groupby('year_week').agg(group_dict).rename(columns={'tweetid':'tweet_count'})
    convo_stats['convo']=convo
    stats_frames.append(convo_stats)
weekly_stats=pd.concat(stats_frames)

# Natural log here (the plot of this table is titled 'Log(Tweet Count)').
weekly_stats['log_count']=np.log(weekly_stats['tweet_count'])
# Index by the week's first date so the rows can be plotted on a date axis.
weekly_stats.index=weekly_stats['date']

In [None]:
#Weekly Version
# Weeks with min_tweet tweets or fewer are left out of the share panels.
min_tweet=30

# One panel per label plus a final panel with the log weekly tweet count;
# every panel has one line per conversation (including 'Overall').
fig, ax=plt.subplots(len(label_list)+1,1,figsize=(20,20))

convo_list=weekly_stats['convo'].drop_duplicates().sort_values()
for i in range(0,len(label_list)):
    for convo in convo_list:
        ax[i].plot(weekly_stats[(weekly_stats['convo']==convo)&(weekly_stats['tweet_count']>min_tweet)][label_list[i]].sort_index(), alpha=.7, label=convo)
    ax[i].set_title("Share "+label_list[i], fontsize=12)

for convo in convo_list:
    ax[len(label_list)].plot(weekly_stats[weekly_stats['convo']==convo]['log_count'].sort_index(), alpha=.7, label=convo)
ax[len(label_list)].set_title('Log(Tweet Count)', fontsize=12)

for panel_ax in ax:
    # A locator stores the axis it is attached to, so each panel gets its own
    # instances; one shared instance would tie every panel to the last axis.
    panel_ax.xaxis.set_major_locator(mdates.YearLocator())
    panel_ax.xaxis.set_minor_locator(mdates.MonthLocator())
    panel_ax.tick_params(axis='both', which='major', labelsize=10)
    panel_ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax[0].legend()

fig.tight_layout(pad=.5)
#plt.savefig('stat_timeline.png')

In [None]:
# Peek at one row of the fully annotated frame.
sandbox.head(1)