# Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import functions as f

# Acquire my working file 

In [3]:
# Load the prepared dataset; the path is relative to the current working directory.
df = pd.read_csv('prepped_svc.csv')
# NOTE(review): the drop below is disabled — presumably the CSV no longer carries a
# saved index column ('Unnamed: 0'); confirm, then either re-enable or delete it.
# df.drop(columns= 'Unnamed: 0', inplace=True)

In [None]:
from sklearn.impute import SimpleImputer

def impute_missing_values(df, cols, strategy='most_frequent'):
    """
    Impute missing values in specified columns of a DataFrame using SimpleImputer.

    The input DataFrame is never modified; a copy is imputed and returned.

    Parameters:
    - df (pandas DataFrame): The input DataFrame.
    - cols (list): List of column names to impute missing values. An empty list
      returns an unchanged copy of `df`.
    - strategy (str): The imputation strategy to use. Options: 'mean', 'median',
      'most_frequent' (default), or 'constant'.

    Returns:
    - pandas DataFrame: A copy of `df` with missing values imputed in `cols`.
      Each imputed column is cast back to its original dtype when possible.
    """
    df_imputed = df.copy()

    # Nothing to impute: SimpleImputer cannot be fitted on zero columns.
    if len(cols) == 0:
        return df_imputed

    # Remember the dtypes now: fit_transform returns a plain array, so assigning
    # its result back can change column dtypes (e.g. to object when `cols`
    # mixes numeric and string columns).
    original_dtypes = df_imputed[cols].dtypes

    # Fit the imputer on the specified columns and transform the missing values
    imputer = SimpleImputer(strategy=strategy)
    df_imputed[cols] = imputer.fit_transform(df_imputed[cols])

    # Restore the original dtypes on a best-effort basis.
    for col, dtype in original_dtypes.items():
        if df_imputed[col].dtype == dtype:
            continue
        try:
            df_imputed[col] = df_imputed[col].astype(dtype)
        except (TypeError, ValueError):
            # The filled values do not fit the old dtype (e.g. a string fill
            # value in a numeric column); keep the imputed values as they are.
            pass

    return df_imputed


In [None]:
# Fill the gaps in every column with that column's most frequent value.
# NOTE(review): this runs on the full dataset, before the train/validate/test
# split performed in a later cell — confirm that is intended.
all_columns = df.columns.tolist()
df = impute_missing_values(df, cols=all_columns, strategy='most_frequent')

In [None]:
# Convert the age column to an integer dtype.
df['person_age'] = df['person_age'].astype(int)

In [None]:
# Partition the data into train / validate / test sets using the `split` helper
# from the project's `functions` module (proportions and stratification are not
# visible from this notebook).
train, validate, test = f.split(df)

In [None]:
# Print a summary of the full DataFrame: columns, non-null counts and dtypes.
df.info()

## Is there a particular demographic/gender?-person

## Gender 

In [None]:
# Count injury-severity outcomes per gender, draw them as grouped bars and
# save the resulting figure to disk.
gender_vs_severity = pd.crosstab(train['person_gender'], train['person_injury_severity'])
gender_vs_severity.plot(kind='bar', figsize=(20, 20))
plt.savefig('gender.png')

In [None]:
# Show how many training rows fall under each gender label.
train['person_gender'].value_counts()

    Here we see that males make up a larger share of the population than females, so we're going to balance the two groups.

In [None]:
# Build a gender-balanced subset of the training data by drawing the same
# number of rows from each gender.
sample_seed = 42  # fixed seed so the sample, and any statistic computed from it, is reproducible
females = train.loc[train.person_gender == '2 - female']
males = train.loc[train.person_gender == '1 - male']

# Ask for at most 500 rows per gender, but never more than the smaller group
# holds: sample() raises when n exceeds the number of available rows.
per_gender_n = min(500, len(females), len(males))

female_sample = females.sample(n=per_gender_n, random_state=sample_seed)
male_sample = males.sample(n=per_gender_n, random_state=sample_seed)

# Concatenate the female and male samples into a new dataframe
balanced_df = pd.concat([female_sample, male_sample])

In [None]:
# Shuffle the rows so the two gender samples are interleaved; the seed keeps
# the row order identical from run to run.
balanced_df = balanced_df.sample(frac=1, random_state=42)

In [None]:
# Grouped bar chart of injury severity per gender on the balanced sample.
# The size is passed to DataFrame.plot directly: pandas opens its own figure
# when no `ax` is given, so a preceding plt.figure(figsize=...) would only
# produce an extra, empty figure.
pd.crosstab(balanced_df.person_gender, balanced_df.person_injury_severity).plot(
    kind='bar',
    figsize=(12, 12),  # Adjust the dimensions as per your requirement
)
plt.show()


    Takeaways:
    - injury severity looks about the same for both genders when they are equally represented
    - little, if any, difference in injury severity between genders.

In [None]:
# Display the gender x injury-severity counts for the balanced sample.
pd.crosstab(
    index=balanced_df['person_gender'],
    columns=balanced_df['person_injury_severity'],
)

In [None]:
from scipy.stats import chi2_contingency

# Contingency table: one row per gender, one column per injury severity
contingency_table = pd.crosstab(
    index=balanced_df['person_gender'],
    columns=balanced_df['person_injury_severity'],
)

# Chi-square test of independence on that table
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Report the statistic, its p-value and the degrees of freedom
for label, value in (
    ("Chi-square test statistic:", chi2),
    ("P-value:", p_value),
    ("Degrees of freedom:", dof),
):
    print(label, value)


    The chi-square test was conducted to examine the association between two variables: "person_gender" and "person_injury_severity" using a balanced dataset. The results of the chi-square test are as follows:

    Chi-square test statistic: 8.45
    P-value: 0.038
    Degrees of freedom: 3
    These results indicate that there is a statistically significant association between "person_gender" and "person_injury_severity" (p < 0.05). The chi-square test statistic of 8.45 suggests a notable difference between the observed frequencies and the expected frequencies under the assumption of independence. With a p-value of 0.038, there is sufficient evidence at the 5% significance level to reject the null hypothesis of independence and conclude that the variables are associated.

    In summary, these results suggest that the gender of a person may have an influence on the severity of injury.

## Is there a particular age?-person

In [None]:
# Define the age buckets. The last edge is open-ended so that the '61+' label
# really covers every age above 60 instead of stopping at 100.
age_bins = [0, 20, 40, 60, np.inf]
age_labels = ['0-20', '21-40', '41-60', '61+']

# Create a new column with age buckets. include_lowest=True keeps age 0 inside
# the '0-20' bucket; by default pd.cut bins are open on the left, which would
# turn age 0 into NaN.
train['age_group'] = pd.cut(
    train['person_age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

# Generate the cross-tabulation and plot the bar chart
pd.crosstab(train['age_group'], train['person_injury_severity']).plot(kind='bar', figsize=(20,20))

plt.savefig('age_by-injury_severity.png')

In [None]:
# Cross-tabulate injury severity against the raw `person_age` values (one row
# per distinct age), not against the binned `age_group` column.
cross_tab = pd.crosstab(train['person_age'], train['person_injury_severity'])

# Perform the chi-square test
# (chi2_contingency is imported in an earlier cell.)
# NOTE(review): a table with one row per distinct age likely has many sparse
# cells; confirm the expected-count assumption of the chi-square test holds,
# or consider running the test on `age_group` instead.
chi2, p_val, dof, expected = chi2_contingency(cross_tab)

# Print the test results
print("Chi-square statistic:", chi2)
print("p-value:", p_val)
print("Degrees of freedom:", dof)

    The chi-square test was performed to examine the association between the age of individuals and the severity of their injuries. The cross-tabulation of the variables 'person_age' (individual ages, not the binned age groups) and 'person_injury_severity' was analyzed.

    Chi-square statistic: 408.272
    Degrees of freedom: 237
    p-value: 3.104e-11
    The chi-square statistic value was found to be 408.272 with 237 degrees of freedom. This indicates a substantial deviation from the expected frequencies under the assumption of independence between the variables.

    The resulting p-value of 3.104e-11 suggests strong evidence against the null hypothesis of independence. Therefore, we reject the null hypothesis and conclude that there is a significant association between age and the severity of injuries among individuals in the dataset.

    This finding emphasizes the importance of considering the age group when evaluating injury severity, suggesting that age may be a contributing factor in determining the extent of injuries sustained.



# Does a motorcycle endorsement have an effect on the type of injuries?

In [None]:
# Count injury-severity outcomes by motorcycle-endorsement status, draw them as
# grouped bars and save the figure to disk.
endorsement_vs_severity = pd.crosstab(train['has_motocycle_endorsment'], train['person_injury_severity'])
endorsement_vs_severity.plot(kind='bar', figsize=(20, 20))
plt.savefig('ME.png')

## Ethnicity ?

In [None]:
# Grouped bar chart of injury-severity counts per ethnicity (default figure size).
pd.crosstab(
    train['person_ethnicity'],
    train['person_injury_severity'],
).plot(kind='bar')

In [None]:
# Show how many training rows fall under each ethnicity label.
train['person_ethnicity'].value_counts()

## Does where the fault of the crash lie matter?-person?-charges

    We will explore this at a later time 

# Has motorcycle endorsement

In [None]:
# Grouped bar chart of injury-severity counts by motorcycle-endorsement status
# (default figure size, not saved to disk).
pd.crosstab(
    train['has_motocycle_endorsment'],
    train['person_injury_severity'],
).plot(kind='bar')

In [None]:
# Grouped bar chart of helmet-usage counts per ethnicity.
pd.crosstab(
    train['person_ethnicity'],
    train['person_helmet'],
).plot(kind='bar')

# visualizations for narrative 

## 1 chart for age bins by crash severity

## 1 chart for ethnicity

In [None]:
# Large grouped bar chart of injury-severity counts per ethnicity for the
# narrative; writing it to disk is currently disabled.
pd.crosstab(
    train['person_ethnicity'],
    train['person_injury_severity'],
).plot(kind='bar', figsize=(20, 20))
# plt.savefig('person_ethnicity.png')

## 1 chart for gender

## 1 chart for ME by crash severity

    (later we'll add: 1 clustering of age bins by person helmet)

## Export csv for modeling:

In [None]:
# Display the available column names before choosing the modeling columns.
df.columns

In [None]:
# Columns exported for modeling: person attributes, the injury-severity column
# and vehicle descriptors.
model_columns = [
    'person_age',
    'person_ethnicity',
    'person_gender',
    'has_motocycle_endorsment',
    'person_injury_severity',
    'vehicle_body_style',
    'vehicle_color',
    'vehicle_defect_1',
    'vehicle_make',
    'vehicle_model_name',
    'vehicle_model_year',
]

In [None]:
# Write the selected modeling columns of the full (imputed) DataFrame to disk.
# NOTE(review): to_csv also writes the row index by default, which shows up as
# an 'Unnamed: 0' column when the file is read back; confirm whether the
# consumer of this file expects that before adding index=False.
df[model_columns].to_csv('master_modeling.csv')