In [49]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# pipeline functions
from pipeline import *

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression

# run plotly in jupyter
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('data/terror_db.csv')

In [4]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
sec_df = pd.read_csv('data/2nd_edu.csv')
no_df = pd.read_csv('data/no_edu.csv')

In [10]:
combined = create_combined_frame(df, sec_df)
sec_df_merged = event_count(combined)

In [11]:
sec_df_merged.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.053919
nattacks,-0.053919,1.0


In [12]:
fig = px.scatter(sec_df_merged, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [9]:
combined = create_combined_frame(df, no_df)
no_df_merged = event_count(combined)

In [10]:
no_df_merged.corr()

Unnamed: 0,education,nattacks
education,1.0,0.065827
nattacks,0.065827,1.0


In [11]:
fig = px.scatter(no_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

#### Top 30

In [19]:
sec_combined = create_combined_frame(df, sec_df)
top_30 = sec_combined.sort_values(by='total_attacks', ascending=False).iloc[:30, :]

thirty_sec_df = event_count(top_30)

In [20]:
thirty_sec_df.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.141933
nattacks,-0.141933,1.0


In [20]:
fig = px.scatter(thirty_sec_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [21]:
no_combined = create_combined_frame(df, no_df)
top_30 = no_combined.sort_values(by='total_attacks', ascending=False).iloc[:30, :]

thirty_no_df = event_count(top_30)

In [22]:
thirty_no_df.corr()

Unnamed: 0,education,nattacks
education,1.0,0.169071
nattacks,0.169071,1.0


In [23]:
fig = px.scatter(thirty_no_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [16]:
out_sec_combined = create_combined_frame(df, sec_df)

q_low = out_sec_combined["total_attacks"].quantile(0.01)
q_hi  = out_sec_combined["total_attacks"].quantile(0.99)

sec_df_filtered = out_sec_combined[(out_sec_combined["total_attacks"] < q_hi) & (out_sec_combined["total_attacks"] > q_low)]

sec_df_filt = event_count(sec_df_filtered)

In [17]:
sec_df_filt.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.172432
nattacks,-0.172432,1.0


In [26]:
fig = px.scatter(sec_df_filt, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [15]:
out_no_combined = create_combined_frame(df, no_df)

q_low = out_no_combined["total_attacks"].quantile(0.05)
q_hi  = out_no_combined["total_attacks"].quantile(0.95)

no_df_filtered = out_no_combined[(out_no_combined["total_attacks"] < q_hi) & (out_no_combined["total_attacks"] > q_low)]

no_df_filt = event_count(no_df_filtered)

In [28]:
no_df_filt.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.038685
nattacks,-0.038685,1.0


In [29]:
fig = px.scatter(no_df_filt, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

#### Hypothesis Testing

In [30]:
'''
Is there a relationship between education and attacks?
'''

'\nIs there a relationship between education and attacks?\n'

In [33]:
combined = create_combined_frame(df, sec_df)


# sec_df_merged = event_count(combined)
combined

AttributeError: 'DataFrame' object has no attribute 'iyear'

In [27]:
'''
How does this change in 5 year increments?
'''

lst_df = event_count(combined)
med = lst_df.education.median()
mean = lst_df.education.median()

low_ed = lst_df[lst_df.education <= med]
high_ed = lst_df[lst_df.education > med]


In [47]:
low_ed

Unnamed: 0,education,nattacks
9,4.749167,777.0
10,5.382500,867.0
11,5.840000,693.0
12,6.006667,622.0
13,6.268333,939.0
...,...,...
1087,10.421667,0.0
1116,11.110000,0.0
1125,3.879167,0.0
1126,5.349167,0.0


In [52]:
Y_1 = low_ed.nattacks
X_1 = low_ed.education

Y_2 = high_ed.nattacks
X_2 = high_ed.education

# regression
reg = LinearRegression().fit(np.vstack(X_1), Y_1)
low_ed['low_ed_bestfit'] = reg.predict(np.vstack(X_1))

reg = LinearRegression().fit(np.vstack(X_2), Y_2)
high_ed['high_ed_bestfit'] = reg.predict(np.vstack(X_2))



In [59]:

fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Scatter(x=low_ed.education, y=low_ed.nattacks, mode='markers',
              marker_color=low_ed.education, text=low_ed.education),
    row=1, col=1)

fig.add_trace(
    go.Scatter(x=high_ed.education, y=high_ed.nattacks, mode='markers',
              marker_color=high_ed.education),
    row=1, col=2)

fig.update_layout(
    height=500, width=1000, 
    title_text="Comparing Low Education vs High Education",
    showlegend=True,
    title_x=0.5)


fig.update_xaxes(title_text="% Women Completed Secondary School", row=1, col=1)
fig.update_xaxes(title_text="% Women Completed Secondary School", row=1, col=2)

fig.update_yaxes(title_text="Number of Terrorist Events", row=1, col=1)
fig.update_yaxes(title_text="Number of Terrorist Events", range=[-800, 12000], row=1, col=2)


fig.add_trace(go.Scatter(name='killed', x=X_1, y=low_ed['low_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446' , width=3)), row=1, col=1)
fig.add_trace(go.Scatter(name='injured', x=X_2, y=high_ed['high_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446', width=3)), row=1, col=2)
fig.show()

In [24]:
t_test = stats.ttest_ind(low_ed.nattacks, high_ed.nattacks)
pearson = stats.pearsonr(low_ed.nattacks, high_ed.nattacks)
spearman = stats.spearmanr(low_ed.nattacks, high_ed.nattacks)
print(t_test)
print("_"*40)
print(pearson)
print("_"*40)
print(spearman)

Ttest_indResult(statistic=2.2536640854890457, pvalue=0.02440800655133116)
________________________________________
(0.029379980396619244, 0.4850546577328326)
________________________________________
SpearmanrResult(correlation=0.3085496632800628, pvalue=5.690038307527032e-14)


In [61]:
'''plot distributions'''


low_mean = np.mean(low_ed.nattacks)
high_mean = np.mean(high_ed.nattacks)

low_std = np.std(low_ed.nattacks)
high_std = np.std(high_ed.nattacks)

low_err = np.std(low_ed.nattacks) / len(low_ed)
high_err = np.std(high_ed.nattacks) / len(high_ed)

low_nums = stats.norm(low_mean, low_std).rvs(10000)
high_nums = stats.norm(high_mean, high_std).rvs(10000)

fig = go.Figure()
fig.add_trace(go.Histogram(x=low_nums, name="Low Education"))
fig.add_trace(go.Histogram(x=high_nums, name="High Education"))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms

fig.update_layout(title_text='Simulating Low/High Education Attacks',
                  title_x=0.5,
                  xaxis_title_text='Number of Attacks',
                  yaxis_title_text='Frequency of Value',
                  showlegend=True,
                 )
fig.show()

In [61]:
# q_low = combined["total_attacks"].quantile(0.05)
# q_hi  = combined["total_attacks"].quantile(0.95)

# df_filtered = combined[(combined["total_attacks"] < q_hi) & (combined["total_attacks"] > q_low)]


In [101]:
stats.pearsonr(sec_df_merged.education, sec_df_merged.nattacks.fillna(0))

(-0.05391923329789123, 0.06951720479296539)

In [115]:
stats.pearsonr(thirty_sec_df.education, thirty_sec_df.nattacks.fillna(0))

(-0.1419334390062965, 0.019637219767777197)

In [114]:
stats.pearsonr(thirty_no_df.education, thirty_no_df.nattacks.fillna(0))

(0.16907059389746623, 0.005347766273072314)