# Targeting group among rumor tweets in five killing events 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import word_tokenize
import seaborn as sns

In [2]:
import os
path = r'D:\論文\PHEME9\Data\CSV'
import pathlib
# data_temp = sorted(pathlib.Path(path).glob('*-2.csv'))

In [26]:
killing_events = ['charliehebdo','ferguson','germanwings','sydneysiege','ottawashooting']
killing_events.sort()
killing_events

['charliehebdo', 'ferguson', 'germanwings', 'ottawashooting', 'sydneysiege']

In [33]:
killing_data = pd.DataFrame()
col_names = ['tweet_id','user_id','date','src_tweet_id','rumor_label','original_tweet','cleaned_tweet',
               'sentiment','emotion','source_reply','event']

In [34]:
# merge 5 killing events into one dataframe: killing_data

for i in killing_events:
    rep_path = pathlib.Path(path, i+'-df-2.csv')
    src_path = pathlib.Path(path, i+'-src-2.csv')
    rep = pd.read_csv(rep_path, encoding = 'utf-8', header = 0)
    src = pd.read_csv(src_path, encoding = 'utf-8', header =0)
    
    rep = rep[['reply_tweet_id', 'reply_user_id','reply_date','src_tweet_id', 'label', 'reply_tweet',
                         'cleaned_reply_tw', 'reply_sentiment', 'reply_emo_ro']]
    rep['source_reply'] = 'reply'
    rep['event'] = i
    
    rep.columns = col_names
    
    src = src[['src_tweet_id', 'src_user_id', 'src_date','src_tweet_id', 'label','src_tweet', 
                         'cleaned_src_tw','src_sentiment','src_emo_ro']]
    src['source_reply'] = 'source'
    src['event'] = i
    src.columns = col_names
    
    killing_data = pd.concat([killing_data,rep,src])

    print(killing_data.shape[0])

35630
58502
62511
73746
96276


In [40]:
#  rumor tweets in five killing events : rumor_data
rumor_data = killing_data[killing_data.rumor_label==1]
rumor_data.reset_index(inplace=True,drop=True)

In [42]:
rumor_data.date = pd.to_datetime(rumor_data.date)
rumor_data.tweet_id = rumor_data.tweet_id.astype(str)
rumor_data.user_id = rumor_data.user_id.astype(str)

In [44]:
rumor_data.isnull().sum()

tweet_id           0
user_id            0
date               0
src_tweet_id       0
rumor_label        0
original_tweet     0
cleaned_tweet     10
sentiment          0
emotion            0
source_reply       0
event              0
dtype: int64

In [45]:
rumor_data.dropna(inplace=True)
rumor_data.reset_index(drop=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rumor_data.dropna(inplace=True)


In [46]:
rumor_data.shape

(28905, 11)

In [47]:
rumor_data.head(1)

Unnamed: 0,tweet_id,user_id,date,src_tweet_id,rumor_label,original_tweet,cleaned_tweet,sentiment,emotion,source_reply,event
0,5.527877945031434e+17,202572421.0,2015-01-07 11:24:15+00:00,552783238415265792,1,@H_E_Samuel @George_Berridge @michael_taggart ...,religion peace strike,Positive,anger,reply,charliehebdo


28905 rumorous tweets in five killing events

# Label tweets which targeted any group

In [48]:
'''
Updated keyword list file with filename: '12targetgroup_words.text'
read into python : target_list
'''

"\nUpdated keyword list file with filename: '12targetgroup_words.text'\nread into python : target_list\n"

In [49]:
with open('12targetgroup_words.txt') as f:
    lines = f.readlines()
target_list = [line.strip() for line in lines]

In [56]:
print(target_list)
print("\n# of keywords: ", len(target_list))

['allah', 'attack', 'black', 'christian', 'christians', 'condemn', 'flag', 'free', 'freedom', 'gaza', 'group', 'imam', 'iraq', 'isis', 'islam', 'islamic', 'islamist', 'israel', 'jehovah', 'jihad', 'jihadist', 'militarize', 'militarize', 'military', 'missouri', 'mossad', 'murder', 'muslim', 'muslims', 'pilot', 'prophet', 'public', 'religion', 'shahadah', 'suicide', 'terrorism', 'terrorist', 'terrorists', 'unarm', 'white', 'zionist']

# of keywords:  41


In [51]:
'''
Create a binary variable into rumor_data dataframe - 'target_label' : 
    1, if cleaned tweets contained any keyword in the pre-defined keyword list; 
    0, otherwise.
'''

rumor_data['target_label'] = rumor_data.cleaned_tweet.astype(str).apply(lambda x: 1 
                                                                        if any(word in target_list for word in x.split()) 
                                                                        else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rumor_data['target_label'] = rumor_data.cleaned_tweet.astype(str).apply(lambda x: 1


In [52]:
pd.pivot_table(rumor_data, values='tweet_id', index='event', columns='target_label',
               aggfunc='count', margins = True)

target_label,0,1,All
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
charliehebdo,5031,1618,6649
ferguson,5505,715,6220
germanwings,1894,343,2237
ottawashooting,5024,826,5850
sydneysiege,6236,1713,7949
All,23690,5215,28905


In [53]:
table = pd.pivot_table(rumor_data, index=['event', 'source_reply'],columns='target_label',values='tweet_id',aggfunc='count',margins=True)
# table['target_0'] = (table.C / table.groupby(level=0).C.transform(sum) * 100).astype(str) + '%'
# print table
table.columns = ['no_target','target','total']
table

Unnamed: 0_level_0,Unnamed: 1_level_0,no_target,target,total
event,source_reply,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
charliehebdo,reply,4732,1473,6205
charliehebdo,source,299,145,444
ferguson,reply,5264,674,5938
ferguson,source,241,41,282
germanwings,reply,1745,291,2036
germanwings,source,149,52,201
ottawashooting,reply,4609,787,5396
ottawashooting,source,415,39,454
sydneysiege,reply,5861,1589,7450
sydneysiege,source,375,124,499


In [54]:
table.pipe(lambda d: d.div(d['total'], axis='index')).applymap('{:.2%}'.format)

Unnamed: 0_level_0,Unnamed: 1_level_0,no_target,target,total
event,source_reply,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
charliehebdo,reply,76.26%,23.74%,100.00%
charliehebdo,source,67.34%,32.66%,100.00%
ferguson,reply,88.65%,11.35%,100.00%
ferguson,source,85.46%,14.54%,100.00%
germanwings,reply,85.71%,14.29%,100.00%
germanwings,source,74.13%,25.87%,100.00%
ottawashooting,reply,85.42%,14.58%,100.00%
ottawashooting,source,91.41%,8.59%,100.00%
sydneysiege,reply,78.67%,21.33%,100.00%
sydneysiege,source,75.15%,24.85%,100.00%


In [55]:
#pd.set_option('display.max_colwidth', None)
# rumor_data.loc[0,['event','target_label','original_tweet']]    

## Save rumor_data to "targetgroup.csv" file. 

**targetgroup.csv** is a dataset which contains only rumorous tweets and their replies from five killing events(Charlie Hebdo, Ferguson, Germanwings, Ottawashooting, Sydney Siege), and was labled if targeting groups or not. 

In [None]:
rumor_data.to_csv(r'D:\論文\PHEME9\Data\CSV\targetgroup.csv',index=False)

# Sample from each events and Save to 5 csv files

In [None]:
event_size = rumor_data.groupby(['event']).size().to_list()
event_size

In [None]:
'''
Sample 100 for 5 events and save them as 5 csv files. 
'''
j = 0
for j in range(5):
    #random sampling 50 from each target_label (either ==0 or ==1) 
    nontarget_data = rumor_data.loc[(rumor_data.event==killing_events[j])&(rumor_data.target_label ==0 )].sample(n=50,random_state = 30)
    target_data = rumor_data.loc[(rumor_data.event==killing_events[j])&(rumor_data.target_label ==1 )].sample(n=50,random_state = 30)
    sampling_data = pd.concat([nontarget_data, target_data], ignore_index=True)
    
#     path_targetsamples = r'D:\論文\PHEME9\Data\CSV\targetsamples_1012'
#     path_save = pathlib.Path(path_targetsamples, killing_events[j]+'-targetsample.csv')
#     sampling_data.to_csv(path_save,index=False)
    
    print(sampling_data.value_counts(['target_label']))
    print('event: ',killing_events[j])
    j +=1

In [None]:
'''
# random sample 10 tweets from each event.
sample_index = []
i = 0
for size in event_size:

    # random sample 10 tweets from each event. 
    a = list(np.random.randint(i,i+size-1,10))
    sample_index.extend(a)
    i += size
    print('event size: ',size)
    #print(i)
len(sample_index)
'''

## sentiment analysis

1. When rumorous tweets targeted the Muslims/Terrorist group, they tended to have negative sentiment. 
2. The sentiment among those rumorous tweets which didn't explicitly target any group was almost evenly distributed.

In [None]:
g = rumor_data.groupby(['target_label'])['sentiment'].value_counts(normalize=True).mul(100).rename('count').reset_index()\
.pipe((sns.barplot,'data'), x ='target_label',y='count',hue='sentiment',palette="Set2")

g.axes.set_ylim(0,100)
g.axes.set_title('Sentiment Analysis of Rumor Tweet w.r.t (Non-/)Targeted group')
g.axes.set_xticklabels(['Non-Targeted','Targeted'])
g.axes.set_ylabel('Count (%)')

for p in g.axes.patches:
    txt = str(p.get_height().round(1)) + '%'
    txt_x = p.get_x() 
    txt_y = p.get_height()
    g.axes.text(txt_x,txt_y,txt)
# plt.savefig('D:/論文/PHEME9/Code/graph/targetgroup-sentiment.png',dpi=300)

In [None]:
rumor_data[rumor_data.target_label==1].groupby(['event'])['sentiment'].value_counts(normalize=True).to_frame()

In [None]:
rumor_data[rumor_data.target_label==1].groupby(['event'])['sentiment'].value_counts(normalize=True).mul(100).rename('count').reset_index()\
.pipe((sns.lineplot,'data'), x ='event',y='count',hue='sentiment',palette="Set2")