In [None]:
# Data
import pandas as pd
import numpy as np


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# NLTK
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')  # corpus needed by WordNetLemmatizer and wordnet.synsets below
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# Load the Tesla deaths dataset from the Kaggle input folder and draw a
# heatmap of df.isnull() to locate the missing values.
df = pd.read_csv(r'/kaggle/input/tesla-autonomous-deaths-data-updated-2023/Tesla Deaths - Deaths.csv')
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
df.head(5)

In [None]:
# Clean the data
# 1. Keep only the meaningful columns (positions 1..12).
# 2. Fill the blanks of columns 6..11 with the placeholder "-".
# 3. Drop the rows that still contain missing values (incomplete case records).

# .copy() detaches the slice from the raw frame so the assignment below
# writes to an independent frame instead of a view of the original.
df = df.iloc[:, 1:13].copy()

fill_cols = df.columns[6:12]
df[fill_cols] = df[fill_cols].fillna("-")
df = df.dropna()

In [None]:
# Draw the null-value heatmap again to check for remaining missing data.
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
df.head(5)

In [None]:
# 4. Replace the "-" placeholders with 0; every other value stays unchanged.

# Assign whole columns instead of writing through `df[col].values[b]`:
# the array returned by `.values` may be a copy or read-only (pandas
# Copy-on-Write), in which case the element-wise write is lost or raises.
for col in df.columns[6:12]:
    is_placeholder = df[col].astype(str).str.contains("-", regex=False)
    df[col] = df[col].mask(is_placeholder, 0)
df.head(5)

In [None]:
#打印圖表
#觀察"死亡數"與"時"、"地"之關係
#相關之預測

In [None]:
# 1. Convert the Date column to datetime.
# 2. Bar chart of the Deaths column against the date.

df['Date'] = pd.to_datetime(df['Date'])
x = df['Date']
y = df[' Deaths '].astype(int)

fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(x, y)
ax.set_title("Number of accidents per day in the total year")
ax.set_xlabel("Date")
ax.set_ylabel("Number of events")

plt.show()

In [None]:
# 1. Derive the calendar year and month from the parsed Date column.
# 2. Plot the deaths per victim category, summed per year and per month.

df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month

# Victim-category columns and the bar colour used for each of them.
victim_colors = {
    ' Tesla driver ': "coral",
    ' Tesla occupant ': "sandybrown",
    ' Other vehicle ': "tan",
    ' Cyclists/ Peds ': "darkkhaki",
}
victim_counts = df[list(victim_colors)].astype(int)

for period, axis_label in (('year', "Year"), ('month', "Month")):
    # Sum the rows that share a period; plotting raw rows would draw the
    # bars on top of each other so only the tallest one stays visible.
    totals = victim_counts.groupby(df[period]).sum()

    fig, ax = plt.subplots(figsize=(16, 6))
    bottom = np.zeros(len(totals))
    for column, color in victim_colors.items():
        heights = totals[column].to_numpy()
        # `label` is what plt/ax.legend() needs; without it the legend is empty.
        ax.bar(totals.index, heights, bottom=bottom, color=color,
               width=0.5, label=column.strip())
        bottom += heights  # stack the next category on top of this one

    ax.set_title("Number of deaths per " + period)
    ax.set_xlabel(axis_label)
    ax.set_ylabel("Number of deaths")
    ax.legend()

plt.show()

In [None]:
# Bar charts of the number of records per country and per state.

country_counts = df[" Country "].value_counts()
state_counts = df[" State "].value_counts()

# Kept under their original names for any cell that still reads them.
x, y = country_counts.index, country_counts.values
a, b = state_counts.index, state_counts.values

# One figure per chart. Calling plt.figure(n) and then plt.figure(figsize=...)
# opens two figures each time and leaves the first one empty, so every chart
# is created with a single plt.subplots() call instead.
for counts, axis_label in ((country_counts, "Country"), (state_counts, "State")):
    fig, ax = plt.subplots(figsize=(20, 6))
    # A legend entry only exists for artists that carry a label; the original
    # bars had none, which is why no legend was drawn.
    ax.bar(counts.index, counts.values, color='#e35f62', width=0.36,
           label="Number of records")
    ax.set_xlabel(axis_label)
    ax.set_ylabel("Number of records")
    ax.legend()

plt.show()

In [None]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
# Set NLP model
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter
    of the resulting tag selects adjective (J), verb (V), noun (N) or
    adverb (R). Any other tag falls back to noun.
    """
    treebank_tag = nltk.pos_tag([word])[0][1]  # tag of the only token
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_prefix.get(treebank_tag[:1], wordnet.NOUN)

def lemmatize_text(text):
    """Tokenize ``text`` and return the list of its lemmatized tokens.

    Each token produced by ``word_tokenize`` is lemmatized with the
    part of speech reported by ``get_wordnet_pos`` for that token.
    The debug ``print(text)`` was removed: it echoed every input string
    and buried the cell's actual result.
    """
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w, get_wordnet_pos(w)) for w in word_tokenize(text)]
    
# df[df["event_month"]==11]["Description"].value_counts()


# Unique description strings, ordered as value_counts() returns them.
data = df[" Description "].value_counts().index

# One list of lemmas per unique description.
lemmatized_data = list(map(lemmatize_text, data))
lemmatized_data

In [None]:
from nltk.corpus import wordnet

# Collect every lemma name of every WordNet synset of "crash".
synonyms = [
    lemma.name()
    for synset in wordnet.synsets("crash")
    for lemma in synset.lemmas()
]

print(set(synonyms))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each lemma list into one space-separated string per description.
documents = [' '.join(tokens) for tokens in lemmatized_data]

# TF-IDF features with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
print(X.shape)

In [None]:

# from sklearn.pipeline import Pipeline, TransformerMixin
# from sklearn.neighbors import LocalOutlierFactor
# lof = LocalOutlierFactor(n_neighbors=2)
# a = lof.fit_predict(X)


# bool_list = np.array([True if i == 1 else False for i in a])
# print(X[bool_list].shape)
# X = X[bool_list]


from sklearn.cluster import KMeans

n_clusters = 5

# Fit KMeans on the TF-IDF matrix (five groups), then label every row of X.
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
clusters = kmeans.predict(X)





In [None]:
# Show the cluster label predicted for each row of X.
clusters

In [None]:
# Number of predicted labels (one per row of X).
len(clusters)

In [None]:
# Print the TF-IDF matrix returned by vectorizer.fit_transform.
print(X)

In [None]:
# Attach the cluster label of each record's description as a new column.
# `data` holds the unique descriptions and `clusters` their labels in the
# same order, so a description -> label mapping replaces the nested loop.
# Mapping by value also avoids `df["cluster"][i] = ...`, which is chained
# assignment and used a positional counter as an index label (the index
# has gaps after dropna, so labels and positions no longer match).
cluster_by_description = dict(zip(data, clusters))
df["cluster"] = (
    df[" Description "]
    .map(cluster_by_description)
    .fillna(0)          # same default as before for an unmatched description
    .astype(int)
)

In [None]:
# Display the frame, now including the "cluster" column.
df

In [None]:
# Number of rows in the cleaned frame.
len(df)

In [None]:
# Group the descriptions by cluster: start with one empty bucket per cluster.
# The bucket count follows n_clusters instead of a hard-coded 8, so it stays
# consistent with the KMeans configuration and with accident_score.
accident = [[] for _ in range(n_clusters)]
print(accident)


In [None]:
# Put every unique description into the bucket of its cluster.
# The buckets are rebuilt here so that re-running this cell does not append
# the same descriptions a second time.
accident = [[] for _ in range(n_clusters)]
for description, cluster in zip(data, clusters):
    accident[cluster].append(description)

    

In [None]:
# Occurrence count of each distinct description.
df[" Description "].value_counts()

In [None]:
# The distinct descriptions themselves (index of the counts above).
df[" Description "].value_counts().index

In [None]:
# Print each distinct description on its own line.
for i in df[" Description "].value_counts().index:
    print(i)

In [None]:
# One list of occurrence counts per cluster.
accident_score = [[] for _ in range(n_clusters)]


def get_score_list(accident_n, accident_n_score):
    """Append the occurrence count of each description to a score list.

    Parameters
    ----------
    accident_n : list of str
        Raw (not lemmatized) descriptions belonging to one cluster.
    accident_n_score : list
        Output list, extended in place with the number of rows of
        ``df[" Description "]`` equal to each description, in the order
        of ``accident_n``. Descriptions absent from the column are skipped.
    """
    # Count once per call; the previous version recomputed value_counts()
    # for every comparison inside two nested loops.
    description_counts = df[" Description "].value_counts()
    for description in accident_n:
        if description in description_counts.index:
            accident_n_score.append(description_counts[description])


for i in range(n_clusters):
    get_score_list(accident[i], accident_score[i])

In [None]:
# First 20 descriptions assigned to cluster 4.
accident[4][0:20]

In [None]:
# First 20 descriptions assigned to cluster 0.
accident[0][0:20]

In [None]:
# First 20 descriptions assigned to cluster 1.
accident[1][0:20]

In [None]:
'''
x = ["accident_0","accident_1","accident_2","accident_3","accident_4"]
y = [len(accident_0),len(accident_1),len(accident_2),len(accident_3),len(accident_4)]
'''


# Bar chart of how many distinct descriptions fall into each cluster.
x = [f"accident_{k}" for k in range(n_clusters)]
y = [len(accident[k]) for k in range(n_clusters)]

plt.figure(figsize=(20,10))
# plt.title("accident type's counts")
plt.xlabel("accident type")
plt.ylabel("counts of accident type")

# Write each bar's value just above where its top will be.
for label, height in zip(x, y):
    plt.text(label, height + 0.25, '%.1f' % height,
             ha='center', va='bottom', size=12)
plt.bar(x, y, color='#e35f62')

In [None]:
# Score of a cluster = (sum of its occurrence counts) * (number of entries).
# The per-cluster lists live in `accident_score`; the names
# accident_0_score ... accident_4_score are never defined, so the original
# prints raised NameError.
for idx, scores in enumerate(accident_score):
    print(f"ac{idx} score: ", sum(scores) * len(scores))

In [None]:
# Bar chart of the cluster scores, highest first.
# The scores are computed from accident_score (sum * count per cluster)
# instead of being pasted in from an earlier run, so the chart cannot go
# stale when the data or the clustering changes.
cluster_scores = {
    f"ac{idx} score": sum(scores) * len(scores)
    for idx, scores in enumerate(accident_score)
}
ranked = sorted(cluster_scores.items(), key=lambda item: item[1], reverse=True)
x = [label for label, _ in ranked]
y = [score for _, score in ranked]

plt.figure(figsize=(20,8))
plt.ylabel("Score")
# plt.title("sorted accuracy type score")
for i in range(len(x)):
    height = y[i]
    plt.text(x[i], height + 0.25, '%.1f' %height, ha='center', va='bottom', size = 12)
plt.bar(x,y,color='#e35f62')

In [None]:


# Take the Deaths column (in row order) as the sequence to model.
dff = df.reset_index()[' Deaths ']


# Scale the values to the [0, 1] range as a single-feature column.
# NOTE(review): the scaler is fitted on the whole series before the
# train/test split below — confirm this is acceptable for the evaluation.
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler(feature_range=(0,1))
df1=scaler.fit_transform(np.array(dff).reshape(-1,1))

# Ordered 70/30 split (shuffle=False); X and y are the same array, so only
# the two X parts are kept and the two y parts are discarded.
from sklearn.model_selection import train_test_split
X = df1
y = df1
train_data, test_data, _, _ = train_test_split(X,y,test_size=0.3, shuffle=False)

def create_dataset(dataset, time_step=1):
    """Cut a 2-D array into sliding windows and their next-step targets.

    Only column 0 of ``dataset`` is read. Window ``k`` holds rows
    ``k .. k + time_step - 1`` and its target is row ``k + time_step``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked windows and the matching targets; both are empty
        when ``dataset`` has no more than ``time_step`` rows.
    """
    window_count = len(dataset) - time_step
    windows = [
        dataset[start:start + time_step, 0] for start in range(window_count)
    ]
    targets = [dataset[start + time_step, 0] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# Build windows of 10 consecutive values and their next-step targets,
# separately for the train and the test part.
time_step = 10
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)


# Add a trailing axis of size 1: (samples, time_step) -> (samples, time_step, 1),
# matching the input_shape=(time_step,1) the LSTM is built with.
X_train =X_train.reshape(X_train.shape[0],X_train.shape[1] , 1)
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1] , 1)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM


# Stacked LSTM network: three LSTM layers followed by one Dense output.
model=Sequential()
model.add(LSTM(50,return_sequences=True,input_shape=(time_step,1)))# input: windows of time_step values; 50 units
model.add(LSTM(50,return_sequences=True))# 50 units; input comes from the previous layer
model.add(LSTM(50))# 50 units; input comes from the previous layer


model.add(Dense(1))# reduce to a single output value
model.compile(loss='mean_squared_error',optimizer='adam')# mean-squared-error loss, Adam optimizer

# Train for 60 epochs, using the test windows as validation data.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=60,batch_size=64,verbose=1)

In [None]:
import math

import tensorflow as tf
from sklearn.metrics import mean_squared_error

# Predict on both parts and map the predictions back to the original units.
train_predict = scaler.inverse_transform(model.predict(X_train))
test_predict = scaler.inverse_transform(model.predict(X_test))

# The targets are still in the scaled [0, 1] range; bring them back to the
# original units too, otherwise the RMSE compares two different scales.
y_train_actual = scaler.inverse_transform(y_train.reshape(-1, 1))
y_test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))

train_rmse = math.sqrt(mean_squared_error(y_train_actual, train_predict))
test_rmse = math.sqrt(mean_squared_error(y_test_actual, test_predict))

# Print both values: a notebook cell only echoes its last expression, so the
# train RMSE used to be computed and then silently dropped.
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)

In [None]:
### Plotting
# `time_step` is the window length set where the datasets were built; it is
# reused here rather than redefined so the offsets cannot drift apart.

# Shift the train predictions: the first prediction belongs to row `time_step`.
trainPredictPlot = np.full_like(df1, np.nan)
trainPredictPlot[time_step:len(train_predict) + time_step, :] = train_predict

# Shift the test predictions: they start one more window after the train part.
testPredictPlot = np.full_like(df1, np.nan)
testPredictPlot[len(train_predict) + (time_step * 2):len(df1), :] = test_predict

# Plot the baseline and the predictions. The predictions were already mapped
# back to the original units, so the baseline is inverse-transformed as well.
# (The original line `plt.plot(df1),'blue')` was a SyntaxError.)
plt.plot(scaler.inverse_transform(df1), 'blue', label="actual")
plt.plot(trainPredictPlot, 'red', label="train prediction")
plt.plot(testPredictPlot, 'green', label="test prediction")
plt.legend()
plt.show()