In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the file name to get a usable path.
        print(os.path.join(dirname, filename))



In [None]:
# Load the US Accidents CSV into a pandas DataFrame and preview the first 10 rows.
df= pd.read_csv("/kaggle/input/us-accidents/US_Accidents_Dec21_updated.csv")
df.head(10)

In [None]:
# Let us also check the aggregate statistics for the numerical columns.
df.describe()

# Data Preparation and Cleaning


In [None]:
# Summarise missing values per column: absolute count and share of all rows (in %).
# The null mask is computed only once; the percentage is derived from the counts
# using the row count as the denominator, so it is already in the same
# descending order as `missing`.
missing = df.isnull().sum().sort_values(ascending=False)
percent = missing / len(df) * 100
Total_missing_values = pd.concat([missing, percent], axis=1, keys=["missing", "percent"])
Total_missing_values.head(15)



In [None]:
# Horizontal bar chart of the 12 columns with the most missing values.
missing.head(12).plot.barh()

From analysing the data we can see that certain columns need to be dropped because of their high number of missing values, and certain other columns are not relevant for further analysis.
We can drop columns such as `Number`. `End_Lat` and `End_Lng` could also be dropped because the distance variable can effectively replace them.

In [None]:
# Drop the columns mentioned above, then list every column of the missing-value
# summary that has more than 1000 missing entries.
new_df = df.drop(columns=['Number', 'End_Lat', 'End_Lng'])
Total_missing_values.loc[Total_missing_values["missing"].gt(1000)]

In [None]:
# Drop rows that lack any of the sparsely-missing categorical/time fields.
new_df = new_df.dropna(subset=["Weather_Condition", "Weather_Timestamp", "Airport_Code", "Timezone",
                               "Nautical_Twilight", "Civil_Twilight", "Sunrise_Sunset", "Zipcode",
                               "Astronomical_Twilight"])
# Linearly interpolate missing wind speeds. The result is assigned back through
# `assign` rather than calling `interpolate(inplace=True)` on `new_df[col]`:
# the chained in-place form mutates a temporary under pandas Copy-on-Write
# and therefore silently leaves the frame unchanged.
new_df = new_df.assign(**{
    "Wind_Speed(mph)": new_df["Wind_Speed(mph)"].interpolate(method='linear', limit_direction="forward")
})

In [None]:
# Fill the remaining numeric gaps: median for precipitation and temperature,
# mean for wind chill. A single frame-level `fillna` with a per-column mapping
# replaces the chained `new_df[col].fillna(..., inplace=True)` calls, which
# operate on a temporary Series under pandas Copy-on-Write and would leave
# `new_df` unchanged.
new_df = new_df.fillna(value={
    "Precipitation(in)": new_df["Precipitation(in)"].median(),
    "Wind_Chill(F)": new_df["Wind_Chill(F)"].mean(),
    "Temperature(F)": new_df["Temperature(F)"].median(),
})

# Exploratory Analysis and Visualization

## Location(States, Cities and streets) based analysis:

In [None]:
# Display the column labels of the cleaned frame.
new_df.columns

In [None]:
# The 15 states with the largest number of recorded accidents, as a horizontal bar chart.
state_count15 = new_df["State"].value_counts().sort_values(ascending=False).head(15)
sns.set_style('white')
sns.set(rc={'figure.figsize': (11, 9)})
sns.barplot(y=state_count15.index, x=state_count15.values)
plt.title("Top 15 accident prone states")
plt.xlabel('count')
plt.ylabel('State')
plt.show()

In [None]:
# The 20 cities with the most recorded accidents.
city_count = new_df["City"].value_counts()
fig, axs = plt.subplots(figsize=(13, 7))
sns.countplot(data=new_df, x="City", order=city_count.head(20).index, palette='mako')
plt.title('Top 20 accident prone Cities')
plt.ylabel('Accidents')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Find the five most accident-prone cities within each of the 15 most accident-prone states.
accident_states_df = new_df[new_df['State'].isin(state_count15.index.to_list())]
state_city_df = (
    accident_states_df
    .groupby(["State", "City"])["ID"]
    .count()
    .rename("Accident_counts")
    .reset_index()
)
# Sort by state, then by descending count, and keep the first five rows of each
# state. This yields the same rows as a per-group `nlargest(5)` but does not rely
# on `groupby.apply` seeing the grouping column, so "State" is guaranteed to
# remain a regular column for the `set_index` below.
state_city_df = (
    state_city_df
    .sort_values(["State", "Accident_counts"], ascending=[True, False])
    .groupby("State")
    .head(5)
)
state_city_df.set_index(keys=['State', 'City']).plot.barh(color=("tab:blue"), figsize=(15, 15))
plt.title("Number of accidents per state ,city")
# In a horizontal bar chart the counts run along the x-axis and the
# (State, City) categories along the y-axis.
plt.xlabel('Accident Count')
plt.ylabel('State, City')
plt.show()


## Time Based analysis

In [None]:
# Convert the Start_Time column to pandas datetimes so the `.dt` accessor can be used below.
new_df['Start_Time'] = pd.to_datetime(new_df['Start_Time'])

In [None]:
# Distribution of accidents over the hour of day (12 bins).
hr = new_df['Start_Time'].dt.hour
sns.set_theme(style="dark", palette="muted", color_codes=True)
sns.histplot(hr, bins=12);

In [None]:
# Accident occurrence by day of week (pandas dayofweek: 0 = Monday ... 6 = Sunday).
days = new_df.Start_Time.dt.dayofweek
fig, axs = plt.subplots(figsize=(10, 8))
# `histplot` replaces the deprecated `distplot`. `discrete=True` draws exactly one
# bar per day and `stat="percent"` makes the bar heights match the
# "percentage of accidents" axis label (the old call plotted densities).
sns.histplot(x=days, discrete=True, stat="percent", color='red', ax=axs)
axs.set_xticks(range(7))
axs.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
# x positions are in data units (day index); y is an axes fraction so the
# labels stay inside the plot regardless of the bar heights.
axs.annotate('Weekdays', xy=(2.2, 0.95), xycoords=('data', 'axes fraction'))
axs.annotate('Weekends', xy=(5.2, 0.95), xycoords=('data', 'axes fraction'))
axs.set_xlabel('Days')
axs.set_ylabel('percentage of accidents')
axs.set_title('Accidents by Day of the Week')
plt.show();

In [None]:
# Friday has the highest percentage of accidents, so take a closer look at the
# hourly distribution of accidents on Fridays (dayofweek == 4).
fridays = new_df[new_df.Start_Time.dt.dayofweek == 4]
fig, ax = plt.subplots(figsize=(10, 8))
# Pass the hour vector explicitly as `x`. Supplying it positionally together with
# `data=` raises "got multiple values for argument 'data'" on seaborn versions
# whose first positional parameter is `data`.
sns.kdeplot(x=fridays.Start_Time.dt.hour, fill=True, bw_adjust=2, color='lightgreen', ax=ax);
plt.annotate('Peak', xy=(16, 0.081), xytext=(2.5, 0.062), arrowprops={'arrowstyle': '-|>'}, fontsize=14)
plt.title('Accidents On Friday')
plt.show()



In [None]:
# Extract the calendar year of each accident into its own column.
new_df['Year'] = new_df['Start_Time'].dt.year


In [None]:
# Yearly accident counts, with each bar labelled by its thousands-separated count.
fig, axs = plt.subplots(figsize=(9, 6))
sns.countplot(data=new_df, x='Year', palette='twilight_shifted');
for bar in axs.patches:
    height = bar.get_height()
    # Place the label slightly above the bar, shifted left from its right edge.
    label_pos = (bar.get_x() + bar.get_width() - 0.70, height + 11500)
    axs.annotate(f'{height:,.0f}', label_pos)
plt.title("Accidents By Year")
plt.show()

In [None]:
# Derive a three-letter month label for every accident and show the
# per-month accident counts in ascending order.
new_df['Month'] = new_df['Start_Time'].dt.month.map(
    dict(zip(range(1, 13),
             ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']))
)
new_df['Month'].value_counts().sort_values()

In [None]:
# Monthly analysis of accidents, with each bar labelled by its count.
fig,axs= plt.subplots(figsize=(12,10))
# Month is a plain string column, so without an explicit `order` the bars would
# appear in order of first appearance in the data rather than in calendar order.
sns.countplot(x='Month', data=new_df, palette='RdYlGn',
              order=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
for i in axs.patches:
    # Label position: just above the bar, shifted left from its right edge.
    x= i.get_x()+i.get_width()-0.85
    y= i.get_height()+ 10000
    count= '{:,.0f}'.format(i.get_height())
    axs.annotate(count,(x,y))
axs.set(xlabel='Month',ylabel='Count',title='Accidents by month')
plt.show();


In [None]:
# Severity analysis.
# Severity is a number between 1 and 4, where 1 indicates the least impact on traffic
# (i.e., short delay as a result of the accident) and 4 indicates a significant impact
# on traffic (i.e., long delay).
fig, axs = plt.subplots(figsize=(9, 6))
cmap = plt.get_cmap('tab10')
clr = cmap(np.arange(4))
# Number of accidents recorded at each severity level.
accidents_severity = new_df.groupby('Severity')['ID'].count()
plt.pie(accidents_severity, colors=clr, autopct='%.1f%%', wedgeprops={'width': 0.3})
plt.legend(accidents_severity.index, loc='upper right')
plt.title('Degree of Severity')
plt.show();



In [None]:
# Display the column labels again; Year and Month were added by the cells above.
new_df.columns

## Weather based Analysis

In [None]:
# Collect the weather-related columns (Temperature(F) through Weather_Condition)
# together with Severity. `assign` returns a new, independent frame, so adding the
# Severity column does not write into a slice of `new_df` (which would trigger
# SettingWithCopyWarning).
weather_df = new_df.loc[:, 'Temperature(F)':'Weather_Condition'].assign(Severity=new_df['Severity'])
# The 25 most frequently recorded weather conditions.
weather_count = weather_df['Weather_Condition'].value_counts().head(25)
weather_count

In [None]:
# Horizontal bar chart of the most frequent weather conditions at the time of an accident.
fig, ax = plt.subplots(figsize=(12, 10))
weather_count.plot.barh(ax=ax)
ax.set(
    title='Weather Conditions at Time of Accident ',
    xlabel='Accidents Count',
    ylabel='Weather',
)
plt.show()

In [None]:
# Number of distinct wind-direction values (a missing value, if present, counts as one).
weather_df['Wind_Direction'].nunique(dropna=False)

In [None]:
# Bar chart of the 12 most frequently recorded visibility values.
fig = plt.gcf()
fig.set_size_inches(15, 10)
(
    weather_df['Visibility(mi)']
    .value_counts()
    .sort_values(ascending=False)
    .head(12)
    .plot.bar(width=0.65, align='center', edgecolor='k', linewidth=2.5)
)
plt.title('Accidents count due to Visibility', fontsize=16)
plt.xlabel('Visibility(mi)', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.show();



In [None]:
# Distribution of wind chill for each severity level.
# The theme must be set before the figure is created: matplotlib applies style
# parameters when the axes are constructed, so setting the theme afterwards
# leaves this figure without the whitegrid styling.
sns.set_theme(style="whitegrid")
fig, axs = plt.subplots(figsize=(11, 7))
sns.boxenplot(x="Severity", y="Wind_Chill(F)", data=weather_df, ax=axs);

In [None]:
# Histogram of humidity, coloured by severity, on a white background without grid lines.
sns.set_style("whitegrid", {'axes.grid': False})
hum = sns.displot(
    data=weather_df,
    x='Humidity(%)',
    hue='Severity',
    bins=15,
    palette='winter_r',
    height=5.2,
    aspect=2,
)



In [None]:
# Kernel density estimate of air pressure, one curve per severity level.
sns.displot(
    data=weather_df,
    x="Pressure(in)",
    hue="Severity",
    kind='kde',
    palette='autumn',
    height=5,
    aspect=2,
);

In [None]:
# Accident counts for the 8 most frequent wind directions, one panel per severity level.
g = sns.FacetGrid(weather_df, col='Severity');
# `map_dataframe` passes each panel only the rows of its own Severity value.
# Forwarding `data=weather_df` through `g.map` made every panel plot the entire
# frame, so all panels were identical.
g.map_dataframe(sns.countplot, x='Wind_Direction',
                order=weather_df.Wind_Direction.value_counts().iloc[:8].index);


In [None]:
# Accident counts for the next group of wind directions by frequency, one panel per severity level.
g = sns.FacetGrid(weather_df, col='Severity');
# The slice starts at position 8 (the first direction not covered by the top-8
# panel) so that no direction is skipped. `map_dataframe` passes each panel only
# the rows of its own Severity value; forwarding `data=weather_df` through
# `g.map` made every panel plot the entire frame.
g.map_dataframe(sns.countplot, x='Wind_Direction',
                order=weather_df.Wind_Direction.value_counts().iloc[8:17].index,
                palette='coolwarm');

In [None]:
# Accident counts for the least frequent wind directions, one panel per severity level.
g = sns.FacetGrid(weather_df, col='Severity');
# The slice starts at position 17, immediately after the last position (16)
# shown in the preceding panel group, so that no direction is skipped.
# `map_dataframe` passes each panel only the rows of its own Severity value;
# forwarding `data=weather_df` through `g.map` made every panel plot the entire frame.
g.map_dataframe(sns.countplot, x='Wind_Direction',
                order=weather_df.Wind_Direction.value_counts().iloc[17:].index,
                palette='turbo');


In [None]:
# Pairwise relationships between the weather variables on a 1000-row random sample,
# coloured by severity. A fixed random_state makes the sample, and therefore the
# figure, reproducible across kernel restarts.
sns.pairplot(weather_df.sample(1000, random_state=42), hue="Severity", corner=True, palette='seismic');

### Insights

* From location analysis it is evident that California is the most accident prone state followed by Florida and Texas.
* This is in accordance with their respective population with California being the most populous state in US.
* Miami, Los Angeles and Orlando make up the three most accident-prone cities in the US.
* Accidents by the hour show two peaks: one during the 6-10 am period and a more notable peak during 2-7 pm. This is likely because people travel to and from work during these intervals, which results in greater movement of the population and hence a higher chance of accidents.
* Looking at the accidents by day of the week plot, we can see that weekdays have a higher percentage of accident occurrences than weekends.
* Fridays record the highest percentage of accidents among all days, with the peak between 4-6 pm.
* The yearly accident count reveals that 2021 has the highest number of accidents on record, followed by 2020 and 2019 respectively.
* Among the months, December has the maximum number of accidents, followed by November.
* The severity analysis reveals that the vast majority of accidents (89.1%) are of severity 2.
* The vast majority of accidents occur at fair weather conditions.
