In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# pipeline/helper functions
from pipeline import *

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('data/terror_db.csv')

In [4]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
sec_df = pd.read_csv('data/2nd_edu.csv')
no_df = pd.read_csv('data/no_edu.csv')

In [121]:
combined = create_combined_frame(df, sec_df)
sec_df_merged = event_count(combined)

In [122]:
sec_df_merged.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.053919
nattacks,-0.053919,1.0


In [124]:
fig = px.scatter(sec_df_merged, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [129]:
combined = create_combined_frame(df, no_df)
no_df_merged = event_count(combined)

In [130]:
no_df_merged.corr()

Unnamed: 0,education,nattacks
education,1.0,0.045943
nattacks,0.045943,1.0


In [131]:
fig = px.scatter(no_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['1970', '1975', '1980', '1985', '1990', '1995', '2000', '2005', '2010', 'Country Name'] but received: education

#### Top 30

In [19]:
sec_combined = create_combined_frame(df, sec_df)
top_30 = sec_combined.sort_values(by='total_attacks', ascending=False).iloc[:30, :]

thirty_sec_df = event_count(top_30)

In [20]:
thirty_sec_df.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.141933
nattacks,-0.141933,1.0


In [20]:
fig = px.scatter(thirty_sec_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [21]:
no_combined = create_combined_frame(df, no_df)
top_30 = no_combined.sort_values(by='total_attacks', ascending=False).iloc[:30, :]

thirty_no_df = event_count(top_30)

In [22]:
thirty_no_df.corr()

Unnamed: 0,education,nattacks
education,1.0,0.169071
nattacks,0.169071,1.0


In [23]:
fig = px.scatter(thirty_no_df, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [132]:
out_sec_combined = create_combined_frame(df, sec_df)

q_low = out_sec_combined["total_attacks"].quantile(0.01)
q_hi  = out_sec_combined["total_attacks"].quantile(0.99)

sec_df_filtered = out_sec_combined[(out_sec_combined["total_attacks"] < q_hi) & (out_sec_combined["total_attacks"] > q_low)]

sec_df_filt = event_count(sec_df_filtered)

In [133]:
sec_df_filt.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.172432
nattacks,-0.172432,1.0


In [135]:
fig = px.scatter(sec_df_filt, x="education", y="nattacks", trendline="ols", color='education')
fig.show()

In [116]:
out_no_combined = create_combined_frame(df, no_df)

q_low = out_no_combined["total_attacks"].quantile(0.05)
q_hi  = out_no_combined["total_attacks"].quantile(0.95)

no_df_filtered = out_no_combined[(out_no_combined["total_attacks"] < q_hi) & (out_no_combined["total_attacks"] > q_low)]

no_df_filt = event_count(no_df_filtered)

In [117]:
no_df_filt.corr()

Unnamed: 0,education,nattacks
education,1.0,-0.038685
nattacks,-0.038685,1.0


In [118]:
fig = px.scatter(no_df_filt, x="education", y="nattacks", trendline="lowess", color='education')
fig.show()

#### Hypothesis Testing

In [30]:
'''
Is there a relationship between education and attacks?
'''

'\nIs there a relationship between education and attacks?\n'

In [33]:
combined = create_combined_frame(df, sec_df)


# sec_df_merged = event_count(combined)
combined

AttributeError: 'DataFrame' object has no attribute 'iyear'

In [158]:
combined = create_combined_frame(df, sec_df)
lst_df = event_count(combined)

lst_df['log_edu'] = np.log(lst_df['education'] + 1)
lst_df['log_nattacks'] = np.log(lst_df['nattacks'] + 1)

In [162]:
lst_df.corr()

Unnamed: 0,education,nattacks,log_edu,log_nattacks
education,1.0,-0.053919,0.88811,-0.02289
nattacks,-0.053919,1.0,-0.009296,0.493965
log_edu,0.88811,-0.009296,1.0,0.039818
log_nattacks,-0.02289,0.493965,0.039818,1.0


In [169]:

fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Scatter(x=lst_df.log_edu, y=lst_df.nattacks, mode='markers', showlegend=False),
    row=1, col=1)

fig.add_trace(
    go.Scatter(x=lst_df.log_edu, y=lst_df.log_nattacks, mode='markers', showlegend=False),
    row=1, col=2)

fig.update_layout(
    height=500, width=1000, 
#     title_text="Education vs Attacks",
    showlegend=True,
    title_x=0.5)


fig.update_xaxes(title_text="Log Edu", row=1, col=1)
fig.update_xaxes(title_text="Log Edu/Attacks", row=1, col=2)

# fig.update_yaxes(title_text="Number of Terrorist Events", row=1, col=1)
# fig.update_yaxes(title_text="Number of Terrorist Events", row=1, col=2)


fig.show()

In [167]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Histogram(x=lst_df.log_edu, histnorm='probability', name='education under median'), row=1, col=1)
#     go.Scatter(x=low_ed.education, y=low_ed.nattacks, mode='markers',
#               marker_color=low_ed.education, text=low_ed.education),
#     row=1, col=1)

fig.add_trace(
    go.Histogram(x=lst_df.log_nattacks, histnorm='probability', name='education over median'), row=1, col=2)
#     go.Scatter(x=high_ed.education, y=high_ed.nattacks, mode='markers',
#               marker_color=high_ed.education),
#     row=1, col=2)


fig.update_layout( 
#     title_text="Percentage of Women With No Education",
    showlegend=True,
    title_x=0.5,
     bargap=0.03,
    bargroupgap=0.2
)

fig.update_xaxes(title_text="Distribution of Log Education", row=1, col=1)
fig.update_xaxes(title_text="Distribution of Log Attacks", row=1, col=2)

fig.update_yaxes(title_text="% in Education Range",range=[0, .2], row=1, col=1)
fig.update_yaxes(title_text="% in Educated Range", range=[0, .2], row=1, col=2)

In [161]:
t_test = stats.ttest_ind(lst_df.log_edu, lst_df.log_nattacks)
pearson = stats.pearsonr(lst_df.log_edu, lst_df.log_nattacks)
spearman = stats.spearmanr(lst_df.log_edu, lst_df.log_nattacks)
print(f"Two Sample T Test: {t_test}")
print("_"*40)
print(f"Pearson R: {pearson}")
print("_"*40)
print(f"Spearman R: {spearman}")

Two Sample T Test: Ttest_indResult(statistic=3.570813209173087, pvalue=0.00036328497219664644)
________________________________________
Pearson R: (0.03981800134943947, 0.1802711997944533)
________________________________________
Spearman R: SpearmanrResult(correlation=0.04007026979904227, pvalue=0.17752449282128122)


In [170]:
t_test = stats.ttest_ind(lst_df.log_edu, lst_df.nattacks)
pearson = stats.pearsonr(lst_df.log_edu, lst_df.nattacks)
spearman = stats.spearmanr(lst_df.log_edu, lst_df.nattacks)
print(f"Two Sample T Test: {t_test}")
print("_"*40)
print(f"Pearson R: {pearson}")
print("_"*40)
print(f"Spearman R: {spearman}")

Two Sample T Test: Ttest_indResult(statistic=-6.8295137800912835, pvalue=1.0906027401407358e-11)
________________________________________
Pearson R: (-0.009295564298567321, 0.7545170418012545)
________________________________________
Spearman R: SpearmanrResult(correlation=0.04007026979904227, pvalue=0.17752449282128122)


#### How does education affect acts of terrorism
* This explores the relationship between the % of women that have completed secondary school and terrorist acts

In [66]:
# Merge secondary education completed with terrorism dataframe
combined = create_combined_frame(df, sec_df)

# Create a dataframe with an education column and number of attacks in 5 year intervals
lst_df = event_count(combined)

med = lst_df.education.median()
mean = lst_df.education.median()

# Dividing the frame between education bracets
# median used due to education skew
low_ed = lst_df[lst_df.education <= med]
high_ed = lst_df[lst_df.education > med]

In [67]:
Y_1 = low_ed.nattacks
X_1 = low_ed.education

Y_2 = high_ed.nattacks
X_2 = high_ed.education

# regression to create line on plots
reg = LinearRegression().fit(np.vstack(X_1), Y_1)
low_ed['low_ed_bestfit'] = reg.predict(np.vstack(X_1))

reg = LinearRegression().fit(np.vstack(X_2), Y_2)
high_ed['high_ed_bestfit'] = reg.predict(np.vstack(X_2))



In [137]:

fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Scatter(x=low_ed.education, y=low_ed.nattacks, mode='markers',
              marker_color=low_ed.education, text=low_ed.education, showlegend=False),
    row=1, col=1)

fig.add_trace(
    go.Scatter(x=high_ed.education, y=high_ed.nattacks, mode='markers',
              marker_color=high_ed.education, showlegend=False),
    row=1, col=2)

fig.update_layout(
    height=500, width=1000, 
    title_text="Comparing Low Education vs High Education",
    showlegend=True,
    title_x=0.5)


fig.update_xaxes(title_text="% Women Completed Secondary School", row=1, col=1)
fig.update_xaxes(title_text="% Women Completed Secondary School", row=1, col=2)

fig.update_yaxes(title_text="Number of Terrorist Events", row=1, col=1)
fig.update_yaxes(title_text="Number of Terrorist Events", range=[-120, 2000], row=1, col=2)

# Add regression line
fig.add_trace(go.Scatter(name='Terrorist Events', x=X_1, y=low_ed['low_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446' , width=3), showlegend=False), row=1, col=1)
fig.add_trace(go.Scatter(name='Terrorist Events', x=X_2, y=high_ed['high_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446', width=3)), row=1, col=2)
fig.show()

#### Distribution of education

In [142]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Histogram(x=low_ed.education, histnorm='probability', name='education under median'), row=1, col=1)
#     go.Scatter(x=low_ed.education, y=low_ed.nattacks, mode='markers',
#               marker_color=low_ed.education, text=low_ed.education),
#     row=1, col=1)

fig.add_trace(
    go.Histogram(x=high_ed.education, histnorm='probability', name='education over median'), row=1, col=2)
#     go.Scatter(x=high_ed.education, y=high_ed.nattacks, mode='markers',
#               marker_color=high_ed.education),
#     row=1, col=2)


fig.update_layout( 
    title_text="Percentage of Women With No Education",
    showlegend=True,
    title_x=0.5,
     bargap=0.03,
    bargroupgap=0.2
)

fig.update_xaxes(title_text="Distribution of Low Education by %", row=1, col=1)
fig.update_xaxes(title_text="Distribution of High Education by %", row=1, col=2)

fig.update_yaxes(title_text="% in Education Range",range=[0, .2], row=1, col=1)
fig.update_yaxes(title_text="% in Educated Range", range=[0, .2], row=1, col=2)

In [109]:
t_test = stats.ttest_ind(low_ed.nattacks, high_ed.nattacks)
pearson = stats.pearsonr(low_ed.nattacks, high_ed.nattacks)
spearman = stats.spearmanr(low_ed.nattacks, high_ed.nattacks)
print(f"Two Sample T Test: {t_test}")
print("_"*40)
print(f"Pearson R: {pearson}")
print("_"*40)
print(f"Spearman R: {spearman}")

Two Sample T Test: Ttest_indResult(statistic=-2.1930799844999016, pvalue=0.02850449730312051)
________________________________________
Pearson R: (0.02813622999663839, 0.5037333987787277)
________________________________________
Spearman R: SpearmanrResult(correlation=0.26111761662296834, pvalue=2.722479143850585e-10)


In [84]:
'''plot distributions'''


low_mean = np.mean(low_ed.nattacks)
high_mean = np.mean(high_ed.nattacks)

low_std = np.std(low_ed.nattacks)
high_std = np.std(high_ed.nattacks)

low_err = np.std(low_ed.nattacks) / len(low_ed)
high_err = np.std(high_ed.nattacks) / len(high_ed)

low_nums = stats.norm(low_mean, low_std).rvs(10000)
high_nums = stats.norm(high_mean, high_std).rvs(10000)

fig = go.Figure()
fig.add_trace(go.Histogram(x=low_nums, name="Low Education"))
fig.add_trace(go.Histogram(x=high_nums, name="High Education", opacity=0.6))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms

fig.update_layout(title_text='Simulating Low/High Education Attacks',
                  title_x=0.5,
                  xaxis_title_text='Number of Attacks',
                  yaxis_title_text='Frequency of Value',
                  showlegend=True,
                 )
fig.show()

In [61]:
# q_low = combined["total_attacks"].quantile(0.05)
# q_hi  = combined["total_attacks"].quantile(0.95)

# df_filtered = combined[(combined["total_attacks"] < q_hi) & (combined["total_attacks"] > q_low)]


In [101]:
stats.pearsonr(sec_df_merged.education, sec_df_merged.nattacks.fillna(0))

(-0.05391923329789123, 0.06951720479296539)

In [115]:
stats.pearsonr(thirty_sec_df.education, thirty_sec_df.nattacks.fillna(0))

(-0.1419334390062965, 0.019637219767777197)

In [114]:
stats.pearsonr(thirty_no_df.education, thirty_no_df.nattacks.fillna(0))

(0.16907059389746623, 0.005347766273072314)

#### Hypothesis Testing No Education

In [71]:
# Merge secondary education completed with terrorism dataframe
combined = create_combined_frame(df, no_df)

# Create a dataframe with an education column and number of attacks in 5 year intervals
lst_df = event_count(combined)

med = lst_df.education.median()
mean = lst_df.education.median()

# Dividing the frame between education bracets
# median used due to education skew
low_ed = lst_df[lst_df.education <= med]
high_ed = lst_df[lst_df.education > med]

In [75]:
Y_1 = low_ed.nattacks
X_1 = low_ed.education

Y_2 = high_ed.nattacks
X_2 = high_ed.education

# regression to create line on plots
reg = LinearRegression().fit(np.vstack(X_1), Y_1)
low_ed['low_ed_bestfit'] = reg.predict(np.vstack(X_1))

reg = LinearRegression().fit(np.vstack(X_2), Y_2)
high_ed['high_ed_bestfit'] = reg.predict(np.vstack(X_2))



In [138]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Scatter(x=low_ed.education, y=low_ed.nattacks, mode='markers',
              marker_color=low_ed.education, text=low_ed.education, name='education below median',
               showlegend=False),
    row=1, col=1)

fig.add_trace(
    go.Scatter(x=high_ed.education, y=high_ed.nattacks, mode='markers',
              marker_color=high_ed.education, name='education above median', showlegend=False),
    row=1, col=2)

fig.update_layout(
    height=500, width=1000, 
    title_text="Comparing Low Education vs High Education",
    showlegend=True,
    title_x=0.5)


fig.update_xaxes(title_text="% women with no education", row=1, col=1)
fig.update_xaxes(title_text="% women with no education", row=1, col=2)

fig.update_yaxes(title_text="Number of Terrorist Events", row=1, col=1)
fig.update_yaxes(title_text="Number of Terrorist Events", range=[-120, 2000], row=1, col=2)


fig.add_trace(go.Scatter(name='Terrorist Events', x=X_1, y=low_ed['low_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446' , width=3), showlegend=False), row=1, col=1)
fig.add_trace(go.Scatter(name='Terrorist Events', x=X_2, y=high_ed['high_ed_bestfit'], mode='lines',
                        line=dict(color='#E48446', width=3)), row=1, col=2)
fig.show()

In [79]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Histogram(x=low_ed.education, histnorm='probability', name='education under median'), row=1, col=1)
#     go.Scatter(x=low_ed.education, y=low_ed.nattacks, mode='markers',
#               marker_color=low_ed.education, text=low_ed.education),
#     row=1, col=1)

fig.add_trace(
    go.Histogram(x=high_ed.education, histnorm='probability', name='education over median'), row=1, col=2)
#     go.Scatter(x=high_ed.education, y=high_ed.nattacks, mode='markers',
#               marker_color=high_ed.education),
#     row=1, col=2)


fig.update_layout( 
    title_text="Percentage of Women Completed Seconday Education",
    showlegend=True,
    title_x=0.5,
     bargap=0.05,
    bargroupgap=0.2
)

fig.update_xaxes(title_text="Distribution of Low Education by %", row=1, col=1)
fig.update_xaxes(title_text="Distribution of High Education by %", row=1, col=2)

fig.update_yaxes(title_text="Number of Terrorist Events",range=[0, .2], row=1, col=1)
fig.update_yaxes(title_text="Number of Terrorist Events", range=[0, .2], row=1, col=2)

In [110]:
t_test = stats.ttest_ind(low_ed.nattacks, high_ed.nattacks)
pearson = stats.pearsonr(low_ed.nattacks, high_ed.nattacks)
spearman = stats.spearmanr(low_ed.nattacks, high_ed.nattacks)
print(f"Two Sample T Test: {t_test}")
print("_"*40)
print(f"Pearson R: {pearson}")
print("_"*40)
print(f"Spearman R: {spearman}")

Two Sample T Test: Ttest_indResult(statistic=-2.1930799844999016, pvalue=0.02850449730312051)
________________________________________
Pearson R: (0.02813622999663839, 0.5037333987787277)
________________________________________
Spearman R: SpearmanrResult(correlation=0.26111761662296834, pvalue=2.722479143850585e-10)


In [87]:
'''plot distributions'''


low_mean = np.mean(low_ed.nattacks)
high_mean = np.mean(high_ed.nattacks)

low_std = np.std(low_ed.nattacks)
high_std = np.std(high_ed.nattacks)

low_err = np.std(low_ed.nattacks) / len(low_ed)
high_err = np.std(high_ed.nattacks) / len(high_ed)

low_nums = stats.norm(low_mean, low_std).rvs(10000)
high_nums = stats.norm(high_mean, high_std).rvs(10000)

fig = go.Figure()
fig.add_trace(go.Histogram(x=low_nums, name="Low Education"))
fig.add_trace(go.Histogram(x=high_nums, name="High Education", opacity=0.6))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms

fig.update_layout(title_text='Simulating Low/High Education Attacks',
                  title_x=0.5,
                  xaxis_title_text='Number of Attacks',
                  yaxis_title_text='Frequency of Value',
                  showlegend=True,
                 )
fig.show()