In [None]:
!unzip /content/drive/MyDrive/cleaned_data.zip

In [424]:
# Imports
import pandas as pd
import numpy as np

from datetime import date, datetime
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score

In [425]:
ages_df = pd.read_csv('cleaned_data/db1.csv')
arrested_df = pd.read_csv('cleaned_data/db2.csv')
salary_df = pd.read_csv('cleaned_data/db3.csv')
phone_number_df = pd.read_csv('cleaned_data/db4.csv')
sus_terrorist = pd.read_csv('cleaned_data/train_data.csv')

In [426]:
ages_df = ages_df.drop('Unnamed: 0', axis=1)
arrested_df = arrested_df.drop('Unnamed: 0', axis=1)
salary_df = salary_df.drop('Unnamed: 0', axis=1)
phone_number_df = phone_number_df.drop('Unnamed: 0', axis=1)
sus_terrorist = sus_terrorist.drop('Unnamed: 0', axis=1)

In [427]:
siblings_df = ages_df.merge(ages_df,left_on=['father_id', 'mom_id'], right_on=['father_id', 'mom_id']).drop_duplicates().rename(columns={'id_x': 'id', 'id_y': 'sibling_id'})
siblings_df = siblings_df[siblings_df.id != siblings_df.sibling_id]

In [428]:
siblings = siblings_df.groupby('id')
eldest = siblings.min('birth_date_y').reset_index()
siblings_number = siblings.count().reset_index()[['id', 'sibling_id']]

In [429]:
train_X = pd.DataFrame(sus_terrorist['suspect_id']).rename(columns={'suspect_id': 'id'})
train_Y = pd.DataFrame(sus_terrorist['is_terrorist'])

In [430]:
eldest_terrorist_df = ages_df.merge(eldest, how='left', left_on='id', right_on='id')[['id', 'birth_date', 'father_id_x', 'sibling_id']].rename(columns={'father_id_x': 'father_id', 'sibling_id': 'eldest_id'}).fillna(0)

In [431]:
def age(born):
    born = born.split(' ')[0]
    born = datetime.strptime(born, "%Y-%m-%d").date()
    today = date.today()
    return today.year - born.year - ((today.month, 
                                      today.day) < (born.month, 
                                                    born.day))

In [432]:
ages_df['age'] = ages_df['birth_date'].apply(age)

In [433]:
# Terrorist eldest brother
eldest_terrorist_df = eldest_terrorist_df.merge(arrested_df, how='left', left_on='eldest_id', right_on='arrested_id')[['id', 'arrested_id']].fillna(0).rename(columns={'arrested_id': 'is_eldest_terrorist'})
eldest_terrorist_df.loc[(eldest_terrorist_df.is_eldest_terrorist != 0), 'is_eldest_terrorist'] = 100000

In [434]:
# Terrorist father
father_terrorist_df = ages_df.merge(arrested_df, how='left', left_on='father_id', right_on='arrested_id')[['id', 'arrested_id']].rename(columns={'arrested_id': 'is_father_terrorist'}).fillna(0)
father_terrorist_df.loc[(father_terrorist_df.is_father_terrorist != 0), 'is_father_terrorist'] = 1000000

In [435]:
# Number of siblings
temp = ages_df.copy()
siblings_number_df = temp.merge(siblings_number, how='left', left_on='id', right_on='id')[['id', 'sibling_id']].fillna(0).rename(columns={'sibling_id': 'siblings_number'})
siblings_number_df['siblings_number'] *= 1000

In [436]:
# Previous arrest
temp = ages_df.copy()
prev_arrest_df = temp.merge(arrested_df, how='left', left_on='id', right_on='arrested_id').fillna(0).rename(columns={'arrested_id': 'prev_arrest'})[['id', 'arrest_latitude', 'arrest_longitude', 'prev_arrest']]
prev_arrest_df.loc[(prev_arrest_df.prev_arrest != 0), 'prev_arrest'] = 1000000

In [437]:
terrorist_phones_df = arrested_df.merge(phone_number_df, right_on='id', left_on='arrested_id')[['id', 'phone']]

In [439]:
calls_to_terrorists = {'id': [], 'terrorist_calls_number': []}

for index, row in ages_df.iterrows():
    try:
        call_df = pd.read_csv('/content/cleaned_data/db5/{}_calls.csv'.format(row['id']))
        calls_to_terrorists['terrorist_calls_number'].append(len(call_df.merge(terrorist_phones_df, left_on='phone_number', right_on='phone').index) * 10000)
        calls_to_terrorists['id'].append(row['id'])
    except FileNotFoundError:
        calls_to_terrorists['terrorist_calls_number'].append(0)
        calls_to_terrorists['id'].append(row['id'])

In [440]:
# Calls from and to terrorists
calls_to_terrorists_df = pd.DataFrame(calls_to_terrorists)

In [441]:
train_X = train_X \
    .merge(eldest_terrorist_df, on='id') \
    .merge(father_terrorist_df, on='id') \
    .merge(siblings_number_df, on='id') \
    .merge(prev_arrest_df, on='id') \
    .merge(calls_to_terrorists_df, on='id') \
    .merge(ages_df, on='id') \
    .drop(['id', 'father_id', 'mom_id', 'birth_date'], axis=1)

In [442]:
model = AdaBoostRegressor(n_estimators=200)
model.fit(train_X, train_Y['is_terrorist'])

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=200, random_state=None)

In [443]:
test_data = pd.read_csv('cleaned_data/test_data.csv')

In [444]:
test_data = test_data.drop('Unnamed: 0', axis=1)

In [445]:
test_X = pd.DataFrame(test_data['suspect_id']).rename(columns={'suspect_id': 'id'})
test_X = test_X \
    .merge(eldest_terrorist_df, on='id') \
    .merge(father_terrorist_df, on='id') \
    .merge(siblings_number_df, on='id') \
    .merge(prev_arrest_df, on='id') \
    .merge(calls_to_terrorists_df, on='id') \
    .merge(ages_df, on='id') \
    .drop(['id', 'father_id', 'mom_id', 'birth_date'], axis=1)
test_Y = test_data['is_terrorist']

In [446]:
result = model.predict(test_X)

In [447]:
test_data['result'] = result

In [448]:
# confusion_matrix(test_data['is_terrorist'], test_data['result'])

In [449]:
# recall_score(test_data['is_terrorist'], test_data['result'])

In [450]:
# precision_score(test_data['is_terrorist'], test_data['result'])

In [451]:
# accuracy_score(test_data['is_terrorist'], test_data['result'])

In [452]:
score_model(test_data)

0.35

In [244]:
def score_model(prediction):
    top_predictions = prediction.sort_values('result', ascending=False).head(int(len(prediction.index) * 0.2))
    return len(top_predictions[top_predictions.is_terrorist == 1]) / len(top_predictions.index)