In [None]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# Load the accidents CSV into the notebook-wide DataFrame `df` used by every later cell.
# NOTE(review): the absolute '/kaggle/input/...' path presumably only resolves inside a
# Kaggle kernel with the 'us-accidents' dataset attached — confirm before running elsewhere.
df = pd.read_csv('/kaggle/input/us-accidents/US_Accidents_March23.csv')

In [None]:
# Summarise the frame: column names, dtypes and non-null counts
df.info()

**Object (20 columns):** These columns are likely to contain text or mixed data. Examples include 'ID', 'Source', 'Start_Time', and 'End_Time'. They may be descriptive (like 'Description') or categorical (like 'City', 'State', 'Weather_Condition').

**Int64 (1 column):** 'Severity' is an integer, which might represent the severity of incidents on a numeric scale.

**Float64 (12 columns):** These columns contain numerical data with decimal points. Examples include geographical coordinates ('Start_Lat', 'Start_Lng'), distance ('Distance(mi)'), and various weather-related measurements such as 'Temperature(F)', 'Wind_Chill(F)', and 'Humidity(%)'.

**Bool (13 columns):** These boolean fields represent True/False conditions. They are likely to indicate the presence or absence of certain features or conditions at the incident location, such as 'Amenity', 'Bump', 'Crossing', and 'Junction'.


In [None]:
# Preview the first few rows to sanity-check the parsed columns and values
df.head()

In [None]:
# Draw the raw scatter and the regression view on separate axes. Calling both
# seaborn functions without `ax=` paints them onto the same implicit axes, so the
# two plots end up stacked on top of each other and cannot be told apart.
fig, (ax_scatter, ax_reg) = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

# scatter plot of 'Temperature(F)' & 'Severity'
sns.scatterplot(data=df, x='Temperature(F)', y='Severity', ax=ax_scatter)
ax_scatter.set_title("Severity vs Temperature(F)")

# same plot with regression line
sns.regplot(data=df, x='Temperature(F)', y='Severity', ax=ax_reg)
ax_reg.set_title("Severity vs Temperature(F) with regression line")

fig.tight_layout()
plt.show()

In [None]:
# Regress 'Severity' on 'Temperature(F)' with ordinary least squares.

# LinearRegression.fit raises a ValueError on NaN input, so keep only the rows
# where both variables are present (weather measurements may be missing).
temp_severity = df[['Temperature(F)', 'Severity']].dropna()

# format variables: both as (n_samples, 1) column arrays
X = temp_severity['Temperature(F)'].to_numpy().reshape(-1, 1)
y = temp_severity['Severity'].to_numpy().reshape(-1, 1)

# run regression
lm = LinearRegression()
model = lm.fit(X, y)

# A notebook cell only displays its last bare expression, so print both fitted
# parameters explicitly instead of relying on implicit output.
print(f'Intercept: {lm.intercept_}')
print(f'Slope coefficient: {lm.coef_}')

In [None]:
# Convert 'Start_Time' and 'End_Time' to datetime if you're working with these columns
# df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# df['End_Time'] = pd.to_datetime(df['End_Time'])

# EDA: Scatter plot for 'Visibility(mi)' and 'Humidity(%)'
sns.scatterplot(x='Visibility(mi)', y='Humidity(%)', data=df)
plt.show()

# Correlation coefficient (Series.corr skips pairs with a missing value)
correlation = df['Visibility(mi)'].corr(df['Humidity(%)'])
print(f'Correlation coefficient: {correlation}')

# Regression analysis
# LinearRegression.fit raises a ValueError on NaN input, so fit only on rows where
# both measurements are present.
visibility_humidity = df[['Visibility(mi)', 'Humidity(%)']].dropna()
X = visibility_humidity[['Visibility(mi)']]  # Independent variable
y = visibility_humidity['Humidity(%)']  # Dependent variable
lm = LinearRegression()
lm.fit(X, y)

# Report the fitted line so the regression is not computed and then discarded
print(f'Intercept: {lm.intercept_}, Slope: {lm.coef_[0]}')

# Regression plot
sns.regplot(x='Visibility(mi)', y='Humidity(%)', data=df)
plt.show()


# Correlogram of the Data

In [None]:
# Restrict to numeric and boolean columns before correlating: on pandas >= 2.0,
# DataFrame.corr() raises on text columns instead of silently dropping them.
# Compute the matrix once and reuse it rather than calling df.corr() three times.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(33,10), dpi= 80)
sns.heatmap(corr_matrix, xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns, cmap='RdYlGn', center=0, annot=True)

# Decorations
plt.title('Correlogram of the Data', fontsize=22)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

# Correlation Analysis: 'Pressure(in)' vs 'Temperature(F)'

In [None]:
# Correlate the two weather measurements using Series.corr with its default settings
pressure = df['Pressure(in)']
temperature = df['Temperature(F)']
pressure_temp_corr = pressure.corr(temperature)

print(f'Correlation coefficient between Pressure and Temperature: {pressure_temp_corr}')

# Regression Analysis: 'Pressure(in)' vs 'Temperature(F)'

In [None]:
# Regress 'Temperature(F)' on 'Pressure(in)' with ordinary least squares.

# LinearRegression.fit raises a ValueError on NaN input, so keep only the rows
# where both measurements are present.
pressure_temp = df[['Pressure(in)', 'Temperature(F)']].dropna()

# Double-bracket selection is already 2-D, so no reshape is needed for the feature;
# the target is kept as an (n_samples, 1) column.
X_pressure = pressure_temp[['Pressure(in)']].to_numpy()
y_temp = pressure_temp['Temperature(F)'].to_numpy().reshape(-1, 1)

# linear regression
lm_pressure_temp = LinearRegression()
lm_pressure_temp.fit(X_pressure, y_temp)

print(f'Intercept: {lm_pressure_temp.intercept_}')
print(f'Coefficient: {lm_pressure_temp.coef_}')

# Visualization: 'Pressure(in)' vs 'Temperature(F)'

In [None]:
# Scatter of 'Temperature(F)' against 'Pressure(in)' with the fitted line overlaid
sns.regplot(
    data=df,
    x='Pressure(in)',
    y='Temperature(F)',
)
plt.show()

# Cross-tabulation: 'Traffic_Signal' & 'Crossing'

In [None]:
# Count rows for every combination of the 'Traffic_Signal' and 'Crossing' flags
traffic_signal_crossing_crosstab = pd.crosstab(
    index=df['Traffic_Signal'],
    columns=df['Crossing'],
)
print(traffic_signal_crossing_crosstab)

In [None]:
# Rows: 'Traffic_Calming'; columns: each ('Bump', 'Stop') combination.
# NOTE(review): the cell under the "Heatmap" heading further down produces this same
# crosstab and heatmap again — one of the two copies is probably redundant.
traffic_features_crosstab = pd.crosstab(
    index=df['Traffic_Calming'],
    columns=[df['Bump'], df['Stop']],
)
sns.heatmap(traffic_features_crosstab, annot=True, fmt='d')
plt.show()

# Heatmap: 'Traffic_Calming', 'Bump', & 'Stop'

In [None]:
# Cross-tabulate 'Traffic_Calming' against the paired 'Bump' / 'Stop' flags
bump_stop_columns = [df['Bump'], df['Stop']]
traffic_features_crosstab = pd.crosstab(df['Traffic_Calming'], bump_stop_columns)

# Annotate each cell with its integer count
sns.heatmap(
    traffic_features_crosstab,
    annot=True,
    fmt='d',
)
plt.show()

# Linear Regression: 'Junction' & 'Severity'

In [None]:
# Recode the 'Junction' flag as integers so it can serve as a numeric regressor.
# This overwrites the column in the shared DataFrame.
df['Junction'] = df['Junction'].astype(int)

# Single-feature design matrix and target
X_junction = df[['Junction']]
y_severity = df['Severity']

# Fit ordinary least squares; fit() returns the estimator itself
linear_model = LinearRegression().fit(X_junction, y_severity)

# Pull out the fitted line's parameters
slope = linear_model.coef_[0]
intercept = linear_model.intercept_

print(f'Intercept: {intercept}, Slope: {slope}')

# Decision Trees

In [None]:
# Make sure the 'Junction' flag is stored as integers before it is used as a feature.
# This overwrites the column in the shared DataFrame.
df['Junction'] = df['Junction'].astype(int)

# Single-feature design matrix and target
X_junction = df[['Junction']]
y_severity = df['Severity']

# Fit a regression tree with default hyper-parameters; fit() returns the estimator itself
tree_model = DecisionTreeRegressor().fit(X_junction, y_severity)

# In-sample predictions: the tree is evaluated on the same rows it was trained on
predicted_severity = tree_model.predict(X_junction)
