In [2]:
import pandas as pd
import numpy as np
import random
import os

In [10]:
# Load the cleaned accident/vehicle records (with station IDs) from the shared data directory.
df = pd.read_csv(
    filepath_or_buffer="../../data/accident_data_with_vehicle_data_further_cleaned_w_stations.csv",
)

In [11]:
# Discard every column whose label starts with "Unnamed", then show the remaining column labels.
df = df.drop(columns=df.columns[df.columns.str.contains('^Unnamed')])
df.columns

Index(['Index', 'Longitude', 'Latitude', 'Accident_Severity', 'Day_of_Week',
       'Road_Type', 'Speed_limit', 'Road_Surface_Conditions',
       'Urban_or_Rural_Area', 'Year', 'Hour', 'Day_of_Year', 'Cluster_1',
       'Vehicle_Type', 'Sex_of_Driver', 'Age_of_Driver',
       'Engine_Capacity_(CC)', 'Age_of_Vehicle', 'Driver_Home_Area_Type',
       'Station ID'],
      dtype='object')

In [16]:
# Count of distinct (non-missing) values in the Cluster_1 column.
n_clus = len(df['Cluster_1'].dropna().unique())

In [17]:

def get_randoms_lists(column, n=None, reverse=False, data=None):
    """Draw ``n`` random values for ``column`` following its observed frequencies.

    The relative frequency of every distinct (non-missing) value in the column
    is used as its sampling probability.

    Parameters
    ----------
    column : str
        Name of the column to sample values from.
    n : int, optional
        Number of values to draw. Defaults to the number of rows of the
        frame being sampled, resolved at call time.
    reverse : bool, default False
        If True, the probability vector is reversed relative to the values
        (ordered from most to least frequent), so the most frequent value
        receives the smallest probability and vice versa.
    data : pandas.DataFrame, optional
        Frame to sample from. Defaults to the notebook-level ``df``.

    Returns
    -------
    list
        ``n`` values drawn with replacement via ``np.random.choice``.
    """
    source = df if data is None else data
    if n is None:
        n = len(source)

    # value_counts orders values from most to least frequent and skips NaN;
    # normalize=True turns the counts into probabilities that sum to 1.
    frequencies = source[column].value_counts(normalize=True)
    values = list(frequencies.index)
    probabilities = list(frequencies.values)
    if reverse:
        probabilities.reverse()

    # Always pass the probabilities: without `p`, np.random.choice samples
    # uniformly and the distribution computed above would be ignored.
    return list(np.random.choice(values, n, p=probabilities))

# Seed NumPy's global RNG so the random pools drawn below are the same on
# every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# One pool of random values per column; each call draws with the function's
# default size and default (non-reversed) probabilities.
Day_of_Week = get_randoms_lists('Day_of_Week')
Speed_limit = get_randoms_lists('Speed_limit')
Road_Surface_Conditions = get_randoms_lists('Road_Surface_Conditions')
Urban_or_Rural_Area = get_randoms_lists('Urban_or_Rural_Area')
Year = get_randoms_lists('Year')
Hour = get_randoms_lists('Hour')
Day_of_Year = get_randoms_lists('Day_of_Year')
Vehicle_Type = get_randoms_lists('Vehicle_Type')
Sex_of_Driver = get_randoms_lists('Sex_of_Driver')
Age_of_Driver = get_randoms_lists('Age_of_Driver')
Engine_Capacity = get_randoms_lists('Engine_Capacity_(CC)')
Driver_Home_Area_Type = get_randoms_lists('Driver_Home_Area_Type')


# negative sampling
# For every accident row, create N_SAMPLES synthetic "no accident" rows in the
# same cluster: the row is copied, Accident_Severity is set to 0, and the time
# and driver/vehicle/road columns are replaced with values from the random pools.
N_SAMPLES = 1

# Read the (Year, Day_of_Year, Hour) pools through private copies and a cursor
# instead of list.pop(0): reads are O(1) and the notebook-level lists are left
# untouched, so this cell can be re-run. The three pools are always read and
# extended in lockstep, so position k in each belongs to the same candidate.
year_pool = list(Year)
day_of_year_pool = list(Day_of_Year)
hour_pool = list(Hour)
time_cursor = 0

# Pools consumed once per accepted negative sample, keyed by target column.
# An IndexError here means the pools hold fewer values than samples requested
# (e.g. N_SAMPLES > 1 with pools of len(df) values).
attribute_pools = {
    'Speed_limit': Speed_limit,
    'Road_Surface_Conditions': Road_Surface_Conditions,
    'Urban_or_Rural_Area': Urban_or_Rural_Area,
    'Vehicle_Type': Vehicle_Type,
    'Sex_of_Driver': Sex_of_Driver,
    'Age_of_Driver': Age_of_Driver,
    'Engine_Capacity_(CC)': Engine_Capacity,
    'Driver_Home_Area_Type': Driver_Home_Area_Type,
}
attr_cursor = 0

clusters = df['Cluster_1'].unique()
frames = []  # per cluster: the real rows followed by their synthetic rows
for i, cluster in enumerate(clusters):
    df_cluster = df.loc[df['Cluster_1'] == cluster]
    print('processing cluster:',str(i), ' of', str(n_clus))

    # (Year, Day_of_Year, Hour) combinations that already have an accident in
    # this cluster; a set lookup replaces a full scan of the cluster per draw.
    taken_slots = set(zip(df_cluster['Year'],
                          df_cluster['Day_of_Year'],
                          df_cluster['Hour']))

    negatives = []
    # to_dict('records') keeps each column's own value type, whereas iterrows
    # upcasts every row to a single common dtype.
    for record in df_cluster.to_dict('records'):
        for n in range(N_SAMPLES):
            new_row = dict(record)  # independent copy of the accident row
            # no accident means accident_severity = 0
            new_row['Accident_Severity'] = 0

            # Draw time slots until one has no recorded accident in the cluster.
            while True:
                year = year_pool[time_cursor]
                day_of_year = day_of_year_pool[time_cursor]
                hour = hour_pool[time_cursor]
                time_cursor += 1
                if (year, day_of_year, hour) not in taken_slots:
                    break
                # Rejected for this cluster: put the candidate back at the end
                # of the pools so a later row can still use it.
                year_pool.append(year)
                day_of_year_pool.append(day_of_year)
                hour_pool.append(hour)

            new_row['Year'] = year
            new_row['Day_of_Year'] = day_of_year
            new_row['Hour'] = hour
            for column, pool in attribute_pools.items():
                new_row[column] = pool[attr_cursor]
            attr_cursor += 1

            negatives.append(new_row)

    frames.append(df_cluster)
    if negatives:
        frames.append(pd.DataFrame(negatives, columns=df.columns))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# growing a frame inside the loop is quadratic.
df_new = pd.concat(frames, ignore_index=True) if frames else df.head(0)

df = df_new
df['Accident'] = ( df['Accident_Severity'] > 0) * 1

processing cluster: 0  of 38208
processing cluster: 1  of 38208
processing cluster: 2  of 38208
processing cluster: 3  of 38208
processing cluster: 4  of 38208
processing cluster: 5  of 38208
processing cluster: 6  of 38208
processing cluster: 7  of 38208
processing cluster: 8  of 38208
processing cluster: 9  of 38208
processing cluster: 10  of 38208
processing cluster: 11  of 38208
processing cluster: 12  of 38208
processing cluster: 13  of 38208
processing cluster: 14  of 38208
processing cluster: 15  of 38208
processing cluster: 16  of 38208
processing cluster: 17  of 38208
processing cluster: 18  of 38208
processing cluster: 19  of 38208
processing cluster: 20  of 38208
processing cluster: 21  of 38208
processing cluster: 22  of 38208
processing cluster: 23  of 38208
processing cluster: 24  of 38208
processing cluster: 25  of 38208
processing cluster: 26  of 38208
processing cluster: 27  of 38208
processing cluster: 28  of 38208
processing cluster: 29  of 38208
processing cluster: 

In [18]:
# Persist the combined accident / non-accident table without the row index.
df.to_csv(
    path_or_buf="../../data/accident_data_with_vehicle_data_further_cleaned_w_stations_n_non_accidents.csv",
    index=False,
)