In [None]:
import re
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import twint

In [2]:
%load_ext autoreload
%autoreload 1

sys.path.append("../../src/")

from d01_data.data_processing import *

%aimport d01_data.data_processing

#### Process raw Russian tweets data into `RightTroll` subset

In [None]:
process_files_into_right_trolls(13, chunksize=50000)

In [None]:
new_filepath = '../../data/02_intermediate/right_trolls_all.csv'

create_combined_right_troll_dataframe(new_filepath)

In [None]:
114810 + 47357 + 98768 + 72548 + 34388 + 85589 + 71943 + 25353 + 37591 + 32445 + 31483 + 53712 + 5681  # check looks good

#### Read in `right_trolls_all.csv`

In [3]:
right_trolls = pd.read_csv('../../data/02_intermediate/right_trolls_all.csv')
start_shape = right_trolls.shape
print(start_shape)
right_trolls.head(3)

(711668, 21)


Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,...,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,906000000000000000,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,...,Right,0,RightTroll,0,905874659358453760,914580356430536707,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914580356430...,,
1,906000000000000000,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,...,Right,0,RightTroll,0,905874659358453760,914621840496189440,http://twitter.com/905874659358453760/statuses...,https://twitter.com/damienwoody/status/9145685...,,
2,906000000000000000,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,...,Right,1,RightTroll,0,905874659358453760,914623490375979008,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/913231923715...,,


#### Convert `publish_date` to `datetime`

In [6]:
right_trolls['publish_date'] = pd.to_datetime(right_trolls.publish_date)

Examine date range

In [7]:
right_trolls.publish_date.min()

Timestamp('2014-11-25 08:46:00')

In [8]:
right_trolls.publish_date.max()

Timestamp('2018-03-22 18:13:00')

#### Filter out retweets

In [9]:
right_trolls = right_trolls[right_trolls.retweet == 0]
print('Filtered out', start_shape[0] - right_trolls.shape[0], 'rows')
right_trolls.shape

Filtered out 309263 rows


(402405, 21)

#### Filter to English

In [10]:
right_trolls = right_trolls[right_trolls.language == 'English']
print('Filtered out', 402405 - right_trolls.shape[0], 'rows')
right_trolls.shape

Filtered out 2555 rows


(399850, 21)

#### Filter down to features of interest

In [11]:
features_to_keep = [
    'author', 'content', 'region', 'publish_date', 'following', 
    'followers', 'updates'
]

right_trolls = right_trolls[features_to_keep]
right_trolls.shape

(399850, 7)

#### Create a new column containing hashtags

In [19]:
pattern = re.compile(r'(#\w*)')
right_trolls['hashtags'] = right_trolls.content.apply(lambda x: re.findall(pattern, x))

In [20]:
right_trolls['hashtags_count'] = right_trolls.hashtags.apply(lambda x: len(x))

In [22]:
right_trolls.hashtags_count.value_counts()

0     243122
1      75241
2      59643
3      11607
4       4556
5       2300
6       1218
8        881
7        724
9        274
10       153
11        88
12        31
14         6
13         3
16         1
15         1
23         1
Name: hashtags_count, dtype: int64

#### Look at different author-level characteristics

In [23]:
right_trolls.author.nunique()

622

In [None]:
regex = re.compile(r'(#\w*)')
print(test_df.content[243865])
re.findall(regex, test_df.content[243865])

In [None]:
example = 'this has multiple hashtags #hashtag1 #hashtag2 la la la'
re.findall(regex, example)

In [None]:
regex = re.compile(r'#(\w*)')
test_df['hashtags'] = test_df.content.apply(lambda x: re.findall(regex, x))

In [None]:
test_df['hashtags_count'] = test_df.hashtags.apply(lambda x: len(x))

In [None]:
test_df[test_df.hashtags_count > 0][['content', 'hashtags']].head()

In [None]:
test_df.content[121]

In [None]:
test_df.account_type.value_counts()

In [None]:
test_df.account_category.value_counts()

Maybe filter on tweets that are just "right trolls" and have hashtags?

In [None]:
test_df[(test_df.account_category == 'RightTroll')].shape  # & (test_df.hashtags_count> 0)].shape#[['content', 'hashtags']]

Filter on `RightTroll` accounts

In [None]:
right_trolls = test_df[test_df.account_category == 'RightTroll']
print(right_trolls.shape)
right_trolls.head(2)

#### See what's hogging memory

In [13]:
# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('right_trolls', 124709176),
 ('pattern', 152),
 ('example', 80),
 ('create_combined_right_troll_dataframe', 72),
 ('filter_dataframe_to_right_trolls', 72),
 ('load_pickle', 72),
 ('process_files_into_right_trolls', 72),
 ('save_pickle', 72),
 ('features_to_keep', 64),
 ('np', 44),
 ('pd', 44),
 ('plt', 44),
 ('start_shape', 36)]

In [None]:
# del(right_trolls)

In [None]:
bots_tweets.publish_date.min()

In [None]:
bots_tweets.publish_date.max()