In [37]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# pipeline functions
from pipeline import *

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

# run plotly in jupyter
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/terror_db.csv')

In [3]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

#### Considering Post 9/11 data

In [4]:
# select events post 9/11 terror attack
# select events where terrorism is certin
# drop events where attack/weapon type is uncertin
post_df = df[
    (df.eventid >= 200109110004) & 
    (df.doubtterr == 0) &
    (df.attacktype1 != 9) &
    (df.weaptype1 != 13)
]

#### What countries have the most attacks?
* by count
* by percentage

In [None]:
x = post_df[['country_txt', 'country']].value_counts()
top_contries_count = x[:5]
top_contries_count

In [None]:
x = post_df[['country_txt', 'country']].value_counts(normalize=True)
top_contries_percent = x[:5]
top_contries_percent

In [None]:
top_df = post_df[
    (post_df.country_txt == 'Iraq') | 
    (post_df.country_txt == 'Pakistan') | 
    (post_df.country_txt == 'Afghanistan') | 
    (post_df.country_txt == 'India') | 
    (post_df.country_txt == 'Colombia')]

In [None]:
fig = px.histogram(top_df, x='country_txt', color='attacktype1_txt',
                  width=800, height=400).update_xaxes(categoryorder="total descending")
fig.update_layout(
    xaxis_title_text='Countries',
    yaxis_title_text='Number of Attacks',
    bargap=0.2, showlegend=True,
    legend_title_text='Attack Types'
)
fig.show();

#### Terrorist Attacks over the years

In [None]:
fig = px.histogram(df, x='iyear', color='attacktype1_txt',
                  width=800, height=400)
fig.update_layout(
    xaxis_title_text='Years',
    yaxis_title_text='Number of Attacks',
    bargap=0.2, showlegend=True,
    legend_title_text='Attack Types'
)
fig.show()

#### Global Map, mapping the number of casualties over time

In [None]:

fig = px.density_mapbox(post_df, lat='latitude', lon='longitude', z='nkill',
                        radius=10, zoom=0.7, center=dict(lat=26, lon=10),
                        mapbox_style="stamen-terrain", animation_frame='iyear', animation_group='country')
fig.show()

In [None]:
'''
Notes
I can drop alternative, alternative_txt

How do the atk types rank amongst eachother,
which ones are more successful than others.
Go through attacks, what made them successful vs unsuccsessful

What is the interaction between atk type, wpn type (note:(weapsubtype1 is more accurate),
target type, targtype1, targsubtype1

highest perp group? (gname)
highest perp number? (nperps)
casualties (nkill) or (nkillus) or (nwound)
perp injured (nwoundte)

where are attacks more idiologically domestic vs. international
INT_IDEO

'''

#### Top Terrorist Groups

In [None]:
x = post_df.gname.value_counts()
top_terror_groups = x[1:6]
top_terror_groups

In [None]:
from pipeline import find_ratios

success_keys, success_vals, fail_keys, fail_vals = find_ratios(top_df, 'attacktype1_txt')

# create figure
fig = go.Figure(data=[
    go.Bar(name='successful', x=success_keys, y=success_vals, marker_color='#BC310E'),
    go.Bar(name='unsuccessful', x=fail_keys, y=fail_vals, marker_color='#3992A4')
])

# update figure
fig.update_layout(barmode='group',
                  title_text='Successes by Attack Type',
                  title_x=0.5,
                  xaxis_title_text='Attack Type',
                  yaxis_title_text='% Successful',
                  bargap=0.2,
                  showlegend=True,
                 )
fig.show()

In [None]:
'''success rate per top affected areas'''

from pipeline import find_ratios

success_keys, success_vals, fail_keys, fail_vals = find_ratios(top_df, 'country_txt')

# create figure
fig = go.Figure(data=[
    go.Bar(name='successful', x=success_keys, y=success_vals, marker_color='#BC310E'),
    go.Bar(name='unsuccessful', x=fail_keys, y=fail_vals, marker_color='#3992A4')
])

# update figure
fig.update_layout(barmode='group',
                  title_text='Successes by Attack Type',
                  title_x=0.5,
                  xaxis_title_text='Attack Type',
                  yaxis_title_text='% Successful',
                  bargap=0.2,
                  showlegend=True,
                 )
fig.show()

In [None]:
plt.figure(figsize=(8,5))
corrMatrix = atk_df.corr()
sns.heatmap(corrMatrix, annot=True, cmap='Blues')
plt.show()

In [None]:
'''
corr between wpn types and success
pearson corr
'''

wpn_df = pd.get_dummies(post_df['weaptype1_txt'])
wpn_df['success'] = post_df.success

plt.figure(figsize=(10,8))
corrMatrix = wpn_df.corr()
sns.heatmap(corrMatrix, annot=True, cmap='Blues')
plt.show()


In [None]:
!ls data

In [28]:
sec_df = pd.read_csv('data/2nd_edu.csv')
no_df = pd.read_csv('data/no_edu.csv')

In [29]:
combined = create_combined_frame(df, sec_df)
sec_df = event_count(combined)

In [30]:
sec_df.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.074023
nattacks,-0.074023,1.0


array([[ 44.85666667, 935.        ],
       [ 47.29416667, 540.        ],
       [ 50.84166667, 326.        ],
       ...,
       [ 25.25      ,          nan],
       [ 35.02083333,   7.        ],
       [ 48.08416667,   4.        ]])

In [70]:

from sklearn import preprocessing

x = sec_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
norm_df = pd.DataFrame(x_scaled)

In [71]:
norm_df

Unnamed: 0,0,1
0,0.675647,0.087216
1,0.712404,0.050331
2,0.765900,0.030348
3,0.695879,0.017835
4,0.650992,0.013820
...,...,...
1129,0.231537,
1130,0.296141,
1131,0.379984,
1132,0.527326,0.000560


In [32]:
fig = px.scatter(sec_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [17]:
combined = create_combined_frame(df, sec_df)
sec_df = event_count(combined)

In [18]:
sec_df.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.063848
nattacks,-0.063848,1.0


In [13]:
fig = px.scatter(sec_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [9]:
combined = create_combined_frame(df, no_df)
no_df = event_count(combined)

In [10]:
no_df.corr()

Unnamed: 0,education,nattacks
education,1.0,0.065827
nattacks,0.065827,1.0


In [11]:
fig = px.scatter(no_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

#### top 30

In [13]:
sec_combined = create_combined_frame(df, sec_df)
top_30 = sec_combined.sort_values(by='total_attacks', ascending=False).iloc[:30, :]

thirty_sec_df = event_count(top_30)

In [14]:
thirty_sec_df.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.141933
nattacks,-0.141933,1.0


In [15]:
fig = px.scatter(thirty_sec_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [16]:
no_combined = create_combined_frame(df, no_df)
top_30 = no_combined.sort_values(by='total_attacks', ascending=False).iloc[:30, :]

thirty_no_df = event_count(top_30)

In [17]:
thirty_no_df.corr()

Unnamed: 0,education,nattacks
education,1.0,0.169071
nattacks,0.169071,1.0


In [18]:
fig = px.scatter(thirty_no_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [19]:
out_sec_combined = create_combined_frame(df, sec_df)

q_low = out_sec_combined["total_attacks"].quantile(0.01)
q_hi  = out_sec_combined["total_attacks"].quantile(0.99)

sec_df_filtered = out_sec_combined[(out_sec_combined["total_attacks"] < q_hi) & (out_sec_combined["total_attacks"] > q_low)]

sec_df_filt = event_count(sec_df_filtered)

In [20]:
sec_df_filt.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.172432
nattacks,-0.172432,1.0


In [21]:
fig = px.scatter(sec_df_filt, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [22]:
out_no_combined = create_combined_frame(df, no_df)

q_low = out_no_combined["total_attacks"].quantile(0.05)
q_hi  = out_no_combined["total_attacks"].quantile(0.95)

no_df_filtered = out_no_combined[(out_no_combined["total_attacks"] < q_hi) & (out_no_combined["total_attacks"] > q_low)]

no_df_filt = event_count(no_df_filtered)

In [23]:
no_df_filt.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.038685
nattacks,-0.038685,1.0


In [24]:
fig = px.scatter(no_df_filt, x="education", y="nattacks", trendline="ols", color='education')
fig.show()