In [34]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os

# pipeline functions
from pipeline import *

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression

# run plotly in jupyter
# connected=True makes the notebook load plotly.js from the CDN instead of
# embedding the library in the notebook file.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Widen pandas' display limits so large frames are not truncated when shown.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ignore warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas warnings about assigning to slices — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [35]:
# Load the terrorism event records; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/terror_db.csv')

In [36]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV by
# an earlier export — confirm). Reassigning with errors='ignore' keeps this
# cell safe to re-run: an in-place drop raises KeyError the second time,
# because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

#### Considering Post 9/11 data

In [37]:
# Build the post-9/11 working set from four independent row masks:
#   * the event id is at or after 200109110004 (events from the 9/11 attacks onward)
#   * terrorism is certain (doubtterr == 0)
#   * the attack type is known (code 9 marks an uncertain attack type)
#   * the weapon type is known (code 13 marks an uncertain weapon type)
is_post_911 = df['eventid'] >= 200109110004
is_certain_terrorism = df['doubtterr'] == 0
has_known_attack_type = df['attacktype1'] != 9
has_known_weapon_type = df['weaptype1'] != 13

post_df = df[
    is_post_911
    & is_certain_terrorism
    & has_known_attack_type
    & has_known_weapon_type
]

#### What countries have the most attacks?
* by count
* by percentage

In [58]:
# Number of post-9/11 attacks per (country name, country code) pair, sorted
# from most to fewest; keep the five largest.
x = post_df.value_counts(subset=['country_txt', 'country'])
top_contries_count = x.iloc[:5]
top_contries_count

country_txt  country
Iraq         95         21740
Pakistan     153        10878
Afghanistan  4          10408
India        92          7895
Philippines  160         3838
dtype: int64

In [59]:
# Same ranking as the counts, expressed as each country's share of all
# post-9/11 attacks (normalize=True divides by the total number of rows).
x = post_df.value_counts(subset=['country_txt', 'country'], normalize=True)
top_contries_percent = x.iloc[:5]
top_contries_percent

country_txt  country
Iraq         95         0.237057
Pakistan     153        0.118616
Afghanistan  4          0.113491
India        92         0.086088
Philippines  160        0.041850
dtype: float64

In [60]:
# create dataframe with top countries
top_df = post_df[
    (post_df.country_txt == 'Iraq') | 
    (post_df.country_txt == 'Pakistan') | 
    (post_df.country_txt == 'Afghanistan') | 
    (post_df.country_txt == 'India') | 
    (post_df.country_txt == 'Colombia')]

In [109]:
# Attacks per top country, coloured by attack type, with the countries ordered
# from most to fewest attacks.
fig = px.histogram(
    top_df,
    x='country_txt',
    color='attacktype1_txt',
    width=800,
    height=400,
)
fig.update_xaxes(categoryorder="total descending")

layout_options = dict(
    xaxis_title_text='Countries',
    yaxis_title_text='Number of Attacks',
    bargap=0.2,
    showlegend=True,
    legend_title_text='Attack Types',
    title_text='Top Countries Affected By Terrorism Post 9/11',
    title_x=0.5,
)
fig.update_layout(**layout_options)

fig.show();
# Writes the static image; the images/ directory must already exist.
fig.write_image("images/top_countries.png")

#### Terrorist Attacks over the years

In [110]:
# Attacks per year over the full dataset (df, not only the post-9/11 subset),
# coloured by attack type.
fig = px.histogram(
    df,
    x='iyear',
    color='attacktype1_txt',
    width=800,
    height=400,
)

layout_options = dict(
    xaxis_title_text='Years',
    yaxis_title_text='Number of Attacks',
    bargap=0.2,
    showlegend=True,
    legend_title_text='Attack Types',
    title_text='Acts of Terrorism Per Year',
    title_x=0.5,
)
fig.update_layout(**layout_options)

fig.show()
# Writes the static image; the images/ directory must already exist.
fig.write_image("images/acts_per_year.png")

#### Global Map, mapping the number of casualties over time

In [33]:
# fig = px.scatter_geo(post_df, lat='latitude', lon='longitude',
#                      hover_name="country_txt",
#                      projection="natural earth")

# fig.update_layout(
#     title_text='Acts of Terrorism',
#     title_x=0.5)
# fig.show()
# fig.write_image("images/global_map.png")

#### Top Terrorist Groups

In [112]:
# Rank perpetrator groups by number of post-9/11 attacks and keep the top five
# named groups. Unattributed attacks are removed by label instead of by
# skipping whatever happens to be ranked first, so the result stays correct
# even if that bucket is not the largest.
# NOTE(review): assumes unattributed attacks are labelled 'Unknown' in gname —
# confirm against the dataset codebook.
group_counts = post_df['gname'].value_counts()
top_terror_groups = group_counts.drop('Unknown', errors='ignore').head(5)
top_terror_groups

Taliban                                           5859
Islamic State of Iraq and the Levant (ISIL)       4073
Boko Haram                                        2168
Al-Shabaab                                        2018
Communist Party of India - Maoist (CPI-Maoist)    1697
Name: gname, dtype: int64

#### What attack types are the most successful?

In [115]:
from pipeline import find_ratios

# find_ratios returns four sequences: labels and values for the successful
# bars, then labels and values for the unsuccessful bars.
success_keys, success_vals, fail_keys, fail_vals = find_ratios(top_df, 'attacktype1_txt')

# one bar series per outcome, drawn side by side for each attack type
successful_bars = go.Bar(
    name='successful', x=success_keys, y=success_vals, marker_color='#BC310E'
)
unsuccessful_bars = go.Bar(
    name='unsuccessful', x=fail_keys, y=fail_vals, marker_color='#3992A4'
)
fig = go.Figure(data=[successful_bars, unsuccessful_bars])

layout_options = dict(
    barmode='group',
    title_text='Successes by Attack Type',
    title_x=0.5,
    xaxis_title_text='Attack Type',
    yaxis_title_text='% Successful',
    bargap=0.2,
    showlegend=True,
)
fig.update_layout(**layout_options)
fig.show()
# fig.write_image("images/suc_by_atk.png")

#### How successful are the top countries at dealing with the threat of terrorism?

In [117]:
# success rate per top affected areas

from pipeline import find_ratios

# find_ratios returns four sequences: labels and values for the successful
# bars, then labels and values for the unsuccessful bars.
success_keys, success_vals, fail_keys, fail_vals = find_ratios(top_df, 'country_txt')

# one bar series per outcome, drawn side by side for each country
successful_bars = go.Bar(
    name='successful', x=success_keys, y=success_vals, marker_color='#BC310E'
)
unsuccessful_bars = go.Bar(
    name='unsuccessful', x=fail_keys, y=fail_vals, marker_color='#3992A4'
)
fig = go.Figure(data=[successful_bars, unsuccessful_bars])

layout_options = dict(
    barmode='group',
    title_text='Successes by Top 5 Affected Country',
    title_x=0.5,
    xaxis_title_text='Country',
    yaxis_title_text='% Successful',
    bargap=0.2,
    showlegend=True,
)
fig.update_layout(**layout_options)
fig.show()
# fig.write_image("images/suc_by_co.png")

In [38]:
# Number of post-9/11 attacks per year: count the non-null event ids in each
# iyear group. The result is a Series named 'eventid' indexed by iyear.
year_agg = post_df.groupby('iyear')['eventid'].count()

In [39]:
# Yearly totals of people killed (nkill) and wounded (nwound).
# The two columns are selected BEFORE summing: summing every column first
# also tries to add up the text columns, which is wasted work and can raise
# on recent pandas versions where non-numeric columns are no longer dropped
# silently.
wounded = post_df.groupby('iyear')[['nkill', 'nwound']].sum()

In [40]:
# Combine the yearly casualty totals with the yearly attack counts. Both
# inputs come from groupby('iyear'), so they are joined on their index, which
# keeps the year as the index of `casualties`.
# Passing the index values as left_on/right_on arrays would instead reset the
# index to 0..n-1 and move the years into a 'key_0' column; the later cells
# use casualties.index as the "Year" axis and as the regression feature, so
# they would plot row positions rather than years.
casualties = pd.merge(wounded, year_agg, left_index=True, right_index=True)

In [41]:
# Average number of people killed / wounded per attack in each year
# ('eventid' holds the yearly attack count).
casualties['nkill_ratio'] = casualties['nkill'] / casualties['eventid']
casualties['nwound_ratio'] = casualties['nwound'] / casualties['eventid']

# Drop the first row (presumably the partial year 2001, since post_df starts
# at the 9/11 attacks — confirm). The explicit copy makes `casualties` an
# independent frame, so the best-fit columns added in later cells are written
# to it directly rather than to a slice of the merged frame.
# NOTE: re-running this cell on its own drops one more row each time; re-run
# the merge cell first.
casualties = casualties.iloc[1:, :].copy()

In [122]:
# Targets (per-attack ratios) and the single feature (the frame's index).
Y_1 = casualties['nkill_ratio']
Y_2 = casualties['nwound_ratio']
X = casualties.index

# regression: fit one straight line per target and store its predictions as
# a new column, to be drawn as a trend line in the plots below
features = np.vstack(X)  # column vector, one row per observation
for target, fitted_column in ((Y_1, 'nkill_bestfit'), (Y_2, 'nwounded_bestfit')):
    reg = LinearRegression().fit(features, target)
    casualties[fitted_column] = reg.predict(features)



#### What is the ratio of casualties to acts of terrorism?

In [137]:


# Grouped bars: killed and wounded per attack for every row of `casualties`.
ratio_bars = [
    go.Bar(name='killed', x=casualties.index, y=casualties.nkill_ratio, marker_color='#BC310E'),
    go.Bar(name='wounded', x=casualties.index, y=casualties.nwound_ratio, marker_color='#3992A4'),
]
fig = go.Figure(data=ratio_bars)

# update figure
layout_options = dict(
    barmode='group',
    title_text='Casualties of Terror Attacks Per Attack by Year',
    title_x=0.5,
    xaxis_title_text='Year',
    yaxis_title_text='Number of Casualties Per Attack',
    bargap=0.4,
    showlegend=True,
)
fig.update_layout(**layout_options)

# Overlay the fitted regression lines, coloured to match their bars.
trend_lines = (
    ('killed', 'nkill_bestfit', '#BC310E'),
    ('injured', 'nwounded_bestfit', '#3992A4'),
)
for trace_name, fitted_column, colour in trend_lines:
    fig.add_trace(go.Scatter(name=trace_name, x=X, y=casualties[fitted_column], mode='lines',
                             line=dict(color=colour, width=3)))

fig.show()
# fig.write_image("images/cas_per_atk.png")

In [139]:
# Two side-by-side panels: killed per attack (left), wounded per attack (right).
fig = make_subplots(rows=1, cols=2)

# marker traces first, so they come before the trend lines in the figure
killed_points = go.Scatter(
    x=casualties.index, y=casualties.nkill_ratio, mode='markers',
    marker_color=casualties.nkill_ratio, text=casualties.nkill_ratio, showlegend=False)
wounded_points = go.Scatter(
    x=casualties.index, y=casualties.nwound_ratio, mode='markers',
    marker_color=casualties.nwound_ratio, showlegend=False)
fig.add_trace(killed_points, row=1, col=1)
fig.add_trace(wounded_points, row=1, col=2)

fig.update_layout(
    height=500,
    width=1000,
    title_text="Average Casualties Per Attack Per Year",
    showlegend=True,
    title_x=0.5,
)

# axis titles, panel by panel
for panel_col, y_title in enumerate(("Killed per Event", "Injured per Event"), start=1):
    fig.update_xaxes(title_text="Years", row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)

# fitted regression line for each panel
trend_lines = (
    ('killed', 'nkill_bestfit', '#BC310E', 1),
    ('injured', 'nwounded_bestfit', '#3992A4', 2),
)
for trace_name, fitted_column, colour, panel_col in trend_lines:
    fig.add_trace(go.Scatter(name=trace_name, x=X, y=casualties[fitted_column], mode='lines',
                             line=dict(color=colour, width=3)), row=1, col=panel_col)
fig.show()
# fig.write_image("images/cas_per_atk_dot.png")

#### What is the relationship between education and terrorism?

In [42]:
# Education tables for the comparison below; the later plots label them
# "% Women Completed Secondary School" and "% Women With No Education".
# Paths are relative to the notebook's working directory.
secondary_df = pd.read_csv('data/2nd_edu.csv')
no_ed_df = pd.read_csv('data/no_edu.csv')

In [43]:
# create_combined_frame is provided by the pipeline module. The cells below
# read '<year>' and '<year>_events' columns from each result.
merged_second_df = create_combined_frame(df, secondary_df)
merged_none_df = create_combined_frame(df, no_ed_df)

# fill nans: missing values in either combined frame become 0
for combined_frame in (merged_second_df, merged_none_df):
    combined_frame.fillna(0, inplace=True)

In [44]:
# create mean events and education column per country
merged_second_df['mean_attacks'] = (merged_second_df['1970_events'] + 
                                    merged_second_df['1975_events'] +
                                    merged_second_df['1980_events'] +
                                    merged_second_df['1985_events'] +
                                    merged_second_df['1990_events'] +
                                    merged_second_df['1995_events'] +
                                    merged_second_df['2000_events'] +
                                    merged_second_df['2005_events'] +
                                    merged_second_df['2010_events']) / 9

merged_second_df['mean_education'] = (merged_second_df['1970'] + 
                                    merged_second_df['1975'] +
                                    merged_second_df['1980'] +
                                    merged_second_df['1985'] +
                                    merged_second_df['1990'] +
                                    merged_second_df['1995'] +
                                    merged_second_df['2000'] +
                                    merged_second_df['2005'] +
                                    merged_second_df['2010']) / 9


merged_none_df['mean_attacks'] = (merged_none_df['1970_events'] + 
                                    merged_none_df['1975_events'] +
                                    merged_none_df['1980_events'] +
                                    merged_none_df['1985_events'] +
                                    merged_none_df['1990_events'] +
                                    merged_none_df['1995_events'] +
                                    merged_none_df['2000_events'] +
                                    merged_none_df['2005_events'] +
                                    merged_none_df['2010_events']) / 9

merged_none_df['mean_education'] = (merged_none_df['1970'] + 
                                    merged_none_df['1975'] +
                                    merged_none_df['1980'] +
                                    merged_none_df['1985'] +
                                    merged_none_df['1990'] +
                                    merged_none_df['1995'] +
                                    merged_none_df['2000'] +
                                    merged_none_df['2005'] +
                                    merged_none_df['2010']) / 9

In [46]:
# Mean, variance and standard deviation of the per-country averages in the
# secondary-education frame: first for attacks, then for education.
for column_name in ('mean_attacks', 'mean_education'):
    column_values = merged_second_df[column_name]
    for statistic in (np.mean, np.var, np.std):
        print(statistic(column_values))

108.14462081128748
66916.06197723714
258.6813908599479
15.437272192827749
139.35153901785043
11.804725283455369


In [45]:
# Target and Feature
Y_1 = merged_second_df.mean_attacks
X_1 = merged_second_df.mean_education

Y_2 = merged_none_df.mean_attacks
X_2 = merged_none_df.mean_education

# regression to create line on plots
reg = LinearRegression().fit(np.vstack(X_1), Y_1)
merged_second_df['sec_ed_bestfit'] = reg.predict(np.vstack(X_1))

reg = LinearRegression().fit(np.vstack(X_2), Y_2)
merged_none_df['no_ed_bestfit'] = reg.predict(np.vstack(X_2))



In [146]:
# Left panel: secondary education vs attacks. Right panel: no education vs attacks.
fig = make_subplots(rows=1, cols=2)

# marker traces, one per combined frame, coloured by the education value
panels = ((merged_second_df, 1), (merged_none_df, 2))
for combined_frame, panel_col in panels:
    fig.add_trace(
        go.Scatter(x=combined_frame.mean_education, y=combined_frame.mean_attacks, mode='markers',
                   marker_color=combined_frame.mean_education, showlegend=False),
        row=1, col=panel_col)

fig.update_layout(
    height=500,
    width=1000,
    title_text="Education vs Number of Terror Attacks",
    showlegend=True,
    title_x=0.5,
)

x_titles = ("% Women Completed Secondary School", "% Women With No Education")
for panel_col, x_title in enumerate(x_titles, start=1):
    fig.update_xaxes(title_text=x_title, row=1, col=panel_col)
    fig.update_yaxes(title_text="Number of Terrorist Events", row=1, col=panel_col)

# Add regression line (only the right-hand one gets a legend entry)
secondary_fit = go.Scatter(name='Terrorist Events', x=X_1, y=merged_second_df['sec_ed_bestfit'],
                           mode='lines', line=dict(color='#E48446', width=3), showlegend=False)
no_education_fit = go.Scatter(name='Terrorist Events', x=X_2, y=merged_none_df['no_ed_bestfit'],
                              mode='lines', line=dict(color='#E48446', width=3))
fig.add_trace(secondary_fit, row=1, col=1)
fig.add_trace(no_education_fit, row=1, col=2)
fig.show()
# fig.write_image("images/edu_vs_atk.png")

#### How are education and terrorist attacks distributed?

In [153]:
# Histograms of the per-country averages, to see how skewed they are.
# Row 1: education (secondary | none). Row 2: attacks for the same two frames.
fig = make_subplots(rows=2, cols=2)

fig.add_trace(
    go.Histogram(x=merged_second_df.mean_education, histnorm='probability', name='distribution of secondary education'), row=1, col=1)

fig.add_trace(
    go.Histogram(x=merged_none_df.mean_education, histnorm='probability', name='distribution of no education'), row=1, col=2)

fig.add_trace(
    go.Histogram(x=merged_second_df.mean_attacks, histnorm='probability', name='distribution of secondary education'), row=2, col=1)

fig.add_trace(
    go.Histogram(x=merged_none_df.mean_attacks, histnorm='probability', name='distribution of no education'), row=2, col=2)



fig.update_layout( 
    title_text="Distribution of education and attacks",
    title_x=0.25,
    showlegend=True)

fig.update_xaxes(title_text="Secondary Education", row=1, col=1)
fig.update_xaxes(title_text="No Education", row=1, col=2)
# The attack histograms are in row 2; titling row 1 again would overwrite the
# education titles above and leave the bottom row unlabelled.
fig.update_xaxes(title_text="Secondary Education Attacks", row=2, col=1)
fig.update_xaxes(title_text="No Edu Attacks", row=2, col=2)

fig.show()
# fig.write_image("images/edu_vs_atk_dis.png")

#### Because the data are skewed, Spearman's rank correlation is used to test the significance of the relationship

In [18]:
# How significant is this correlation
spearman = stats.spearmanr(merged_second_df.mean_education, merged_second_df.mean_attacks)
print(f"Spearman R: {spearman}")
spearman = stats.spearmanr(merged_none_df.mean_education, merged_none_df.mean_attacks)
print(f"Spearman R: {spearman}")

Spearman R: SpearmanrResult(correlation=-0.2277342225532225, pvalue=0.010328712516958242)
Spearman R: SpearmanrResult(correlation=0.23154399207009108, pvalue=0.009088055574197536)


#### How does a log transformation change the skew?

In [154]:
# Natural log of the per-country averages. The averages already exist as the
# 'mean_attacks' / 'mean_education' columns created earlier, so they are
# reused instead of rebuilding the same nine-term sums a second time.
# NOTE(review): np.log maps an average of exactly 0 to -inf, and missing
# values were filled with 0 earlier; confirm that no row averages to 0, or
# consider np.log1p.
for combined_frame in (merged_second_df, merged_none_df):
    combined_frame['log_mean_attacks'] = np.log(combined_frame['mean_attacks'])
    combined_frame['log_mean_education'] = np.log(combined_frame['mean_education'])

In [155]:
# Target (log mean attacks) and feature (log mean education) for each frame
Y_1 = merged_second_df['log_mean_attacks']
X_1 = merged_second_df['log_mean_education']

Y_2 = merged_none_df['log_mean_attacks']
X_2 = merged_none_df['log_mean_education']

# regression to create line on plots: one straight-line fit per frame, with
# the fitted values stored as a new column of that frame
regression_inputs = (
    (merged_second_df, X_1, Y_1, 'log_sec_ed_bestfit'),
    (merged_none_df, X_2, Y_2, 'log_no_ed_bestfit'),
)
for combined_frame, feature, target, fitted_column in regression_inputs:
    reg = LinearRegression().fit(np.vstack(feature), target)
    combined_frame[fitted_column] = reg.predict(np.vstack(feature))



In [158]:
# Log-log version of the education vs attacks scatter plots.
# Left panel: secondary education. Right panel: no education.
fig = make_subplots(rows=1, cols=2)

# Markers are positioned by the log values; the colour still encodes the
# untransformed education average.
fig.add_trace(
    go.Scatter(x=merged_second_df.log_mean_education, y=merged_second_df.log_mean_attacks, mode='markers',
              marker_color=merged_second_df.mean_education, showlegend=False),
    row=1, col=1)

fig.add_trace(
    go.Scatter(x=merged_none_df.log_mean_education, y=merged_none_df.log_mean_attacks, mode='markers',
              marker_color=merged_none_df.mean_education, showlegend=False),
    row=1, col=2)

fig.update_layout(
    height=500, width=1000, 
    title_text="Log Education vs Log Number of Terror Attacks",
    showlegend=True,
    title_x=0.5)


# Both axes show log-transformed values, so the axis titles say so; the
# untransformed wording would misstate what is plotted.
fig.update_xaxes(title_text="Log % Women Completed Secondary School", row=1, col=1)
fig.update_xaxes(title_text="Log % Women With No Education", row=1, col=2)

fig.update_yaxes(title_text="Log Number of Terrorist Events", row=1, col=1)
fig.update_yaxes(title_text="Log Number of Terrorist Events", row=1, col=2)

# Add regression line
fig.add_trace(go.Scatter(name='Terrorist Events', x=X_1, y=merged_second_df['log_sec_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446' , width=3), showlegend=False), row=1, col=1)
fig.add_trace(go.Scatter(name='Terrorist Events', x=X_2, y=merged_none_df['log_no_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446', width=3)), row=1, col=2)
fig.show()
# fig.write_image("images/log_edu_vs_atk.png")

In [162]:
# It looks like the data is not linearly distributed
fig = make_subplots(rows=2, cols=2)

fig.add_trace(
    go.Histogram(x=merged_second_df.log_mean_education, histnorm='probability', name='distribution of secondary education'), row=1, col=1)

fig.add_trace(
    go.Histogram(x=merged_none_df.log_mean_education, histnorm='probability', name='distribution of no education'), row=1, col=2)

fig.add_trace(
    go.Histogram(x=merged_second_df.log_mean_attacks, histnorm='probability', name='distribution of secondary education'), row=2, col=1)

fig.add_trace(
    go.Histogram(x=merged_none_df.log_mean_education, histnorm='probability', name='distribution of no education'), row=2, col=2)


fig.update_layout( 
    title_text="Distribution of Values",
    title_x=0.25,
    showlegend=True)

fig.update_xaxes(title_text="Secondary Education", row=1, col=1)
fig.update_xaxes(title_text="No Education", row=1, col=2)
fig.update_xaxes(title_text="Secondary Education Attacks", row=1, col=1)
fig.update_xaxes(title_text="No Edu Attacks", row=1, col=2)

# fig.write_image("images/log_edu_vs_atk_dis.png")
fig.show()


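#### The data are still significantly skewed, so Spearman's rank correlation is used to test the relationship

Spearman's coefficient depends only on the ranks of the values, and a log transform does not change ranks, so the results below are identical to the untransformed ones.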

In [163]:
# Rank correlation between log education and log attacks, reported first for
# the secondary-education frame and then for the no-education frame.
for combined_frame in (merged_second_df, merged_none_df):
    spearman = stats.spearmanr(combined_frame.log_mean_education, combined_frame.log_mean_attacks)
    print(spearman)

SpearmanrResult(correlation=-0.2277342225532225, pvalue=0.010328712516958242)
SpearmanrResult(correlation=0.23154399207009108, pvalue=0.009088055574197536)
