In [7]:
import pandas as pd  
import numpy as np  
from sklearn.impute import SimpleImputer  
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
df = pd.read_csv('fake_accident_data.csv')
# 数值型数据填充缺失值（例如使用均值）  
imputer = SimpleImputer(strategy='mean')  
df[['PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT']] = imputer.fit_transform(df[['PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT']])  
  
# 文本数据填充缺失值（例如使用空字符串）  
df['LOCATION'].fillna('', inplace=True)

In [9]:
# 数值型数据的异常值处理（例如使用IQR方法）  
Q1 = df['VEHCOUNT'].quantile(0.25)  
Q3 = df['VEHCOUNT'].quantile(0.75)  
IQR = Q3 - Q1  
df = df[(df['VEHCOUNT'] >= Q1 - 1.5 * IQR) & (df['VEHCOUNT'] <= Q3 + 1.5 * IQR)]  
  
df.drop_duplicates(inplace=True)

In [10]:
import nltk  
nltk.download('punkt')  # 用于分词  
nltk.download('stopwords')  # 用于停用词
# NLP处理函数（分词、去停用词）  
def preprocess_text(text):  
    tokens = word_tokenize(text)  
    stop_words = set(stopwords.words('english'))  
    filtered_text = [word for word in tokens if not word in stop_words]  
    return ' '.join(filtered_text)  
  
# 应用NLP处理函数到文本列  
df['LOCATION_PROCESSED'] = df['LOCATION'].apply(preprocess_text)  
  
# 词向量化处理（例如使用TF-IDF）  
vectorizer = TfidfVectorizer()  
location_tfidf = vectorizer.fit_transform(df['LOCATION_PROCESSED'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\24879\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\24879\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# 存储数值型数据和原始文本数据  
df.to_csv('preprocessed_data.csv', index=False)  


In [12]:
import pandas as pd  
  
# 加载预处理后的数据  
df = pd.read_csv('preprocessed_data.csv')

In [13]:
df.dtypes


OBJECTID                int64
REPORTNO               object
STATUS                 object
ADDRTYPE               object
LOCATION               object
COLLISIONTYPE          object
PERSONCOUNT           float64
PEDCOUNT              float64
PEDCYLCOUNT           float64
VEHCOUNT              float64
INJURIES                int64
SERIOUSINJURIES         int64
FATALITIES              int64
INCDATE                object
INCDTTM                object
WEATHER                object
ROADCOND               object
LIGHTCOND              object
SPEEDING               object
HITPARKEDCAR           object
LOCATION_PROCESSED     object
dtype: object

In [14]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns  
correlation_matrix = df[numerical_cols].corr()

In [15]:
# 识别数值列  
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns  
  
# 选择数值列并计算相关性矩阵  
correlation_matrix = df[numerical_cols].corr()  
  
# 打印相关性矩阵  
print(correlation_matrix)

                 OBJECTID  PERSONCOUNT  PEDCOUNT  PEDCYLCOUNT  VEHCOUNT  \
OBJECTID         1.000000     0.000352  0.042831    -0.000717  0.011049   
PERSONCOUNT      0.000352     1.000000  0.031991     0.041986 -0.012406   
PEDCOUNT         0.042831     0.031991  1.000000     0.044405  0.008279   
PEDCYLCOUNT     -0.000717     0.041986  0.044405     1.000000  0.046239   
VEHCOUNT         0.011049    -0.012406  0.008279     0.046239  1.000000   
INJURIES         0.057779     0.043021  0.033252    -0.014158  0.028863   
SERIOUSINJURIES -0.035462     0.050300  0.022874    -0.056013  0.029299   
FATALITIES      -0.021814     0.025310  0.031611    -0.037888  0.023138   

                 INJURIES  SERIOUSINJURIES  FATALITIES  
OBJECTID         0.057779        -0.035462   -0.021814  
PERSONCOUNT      0.043021         0.050300    0.025310  
PEDCOUNT         0.033252         0.022874    0.031611  
PEDCYLCOUNT     -0.014158        -0.056013   -0.037888  
VEHCOUNT         0.028863         0.029

In [16]:
print(df.columns)

Index(['OBJECTID', 'REPORTNO', 'STATUS', 'ADDRTYPE', 'LOCATION',
       'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT',
       'INJURIES', 'SERIOUSINJURIES', 'FATALITIES', 'INCDATE', 'INCDTTM',
       'WEATHER', 'ROADCOND', 'LIGHTCOND', 'SPEEDING', 'HITPARKEDCAR',
       'LOCATION_PROCESSED'],
      dtype='object')


In [17]:
import pandas as pd  
from scipy import stats  
  
# 读取数据  
data = pd.read_csv('preprocessed_data.csv')  
  
# 选择变量  
person_count = data['PERSONCOUNT']  
veh_count = data['VEHCOUNT']  
  
# 执行假设检验  
correlation, p_value = stats.pearsonr(person_count, veh_count)  
  
# 打印结果  
print(f"皮尔逊相关系数: {correlation:.4f}")  
print(f"p值: {p_value:.4f}")  
  
# 解释结果  
alpha = 0.05  # 显著性水平  
if p_value < alpha:  
    print("我们拒绝原假设,认为PERSONCOUNT和VEHCOUNT之间存在显著关系。")  
else:  
    print("我们不能拒绝原假设,没有足够的证据表明PERSONCOUNT和VEHCOUNT之间存在显著关系。")

皮尔逊相关系数: -0.0124
p值: 0.6952
我们不能拒绝原假设,没有足够的证据表明PERSONCOUNT和VEHCOUNT之间存在显著关系。


In [18]:
import pandas as pd  
  
# 加载数据  
data = pd.read_csv('preprocessed_data.csv')

In [19]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import train_test_split  
  
# 假设最后一列是目标变量，其余列是特征变量  
X = data.iloc[:, :-1]  
y = data.iloc[:, -1]  
  
# 划分训练集和测试集  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  
  
# 创建随机森林分类器实例  
rf_classifier = RandomForestClassifier(random_state=42)  
  
# 训练模型  
rf_classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'REPORT30'

In [None]:
import pandas as pd  
import numpy as np  
from sklearn.model_selection import train_test_split, cross_val_score  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import confusion_matrix, classification_report  
from sklearn.cluster import KMeans  
from sklearn.preprocessing import LabelEncoder  
from mlxtend.preprocessing import TransactionEncoder  
from mlxtend.frequent_patterns import apriori, association_rules  
  
# 1. 数据加载与预处理  
data = pd.read_csv('yiyan_file_input.csv')  
data['INCDATE'] = pd.to_datetime(data['INCDATE']).dt.date  
data['INCDTTM'] = pd.to_datetime(data['INCDTTM'])  
  
# 假设我们需要对分类变量进行编码  
le = LabelEncoder()  
cat_cols = ['STATUS', 'ADDRTYPE', 'COLLISIONTYPE', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'SPEEDING', 'HITPARKEDCAR']  
for col in cat_cols:  
    data[col] = le.fit_transform(data[col])  
  
# 2. 数据分析与建模  
# 选择特征和目标变量  
X = data.drop('REPORTNO', axis=1)  # 假设我们用除了REPORTNO之外的所有字段来预测STATUS  
y = data['STATUS']  
X = X.drop('STATUS', axis=1)  
  
# 划分数据集  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  
  
# 构建随机森林模型  
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)  
rf_clf.fit(X_train, y_train)  
  
# 交叉验证  
scores = cross_val_score(rf_clf, X_train, y_train, cv=5)  
print(f"Cross-validation scores: {scores.mean()} ± {scores.std()}")  
  
# 评估模型  
y_pred = rf_clf.predict(X_test)  
print("Confusion Matrix:")  
print(confusion_matrix(y_test, y_pred))  
print("\nClassification Report:")  
print(classification_report(y_test, y_pred))  
  
# 3. 聚类分析  
kmeans = KMeans(n_clusters=3, random_state=42)  
X_cluster = data.drop(['REPORTNO', 'STATUS', 'INCDATE', 'INCDTTM'], axis=1)  # 假设我们不使用这些字段进行聚类  
kmeans.fit(X_cluster)  
data['Cluster'] = kmeans.labels_  
  
# 4. 关联规则挖掘  
# 为了关联规则挖掘，我们需要将数据转换为交易格式  
te = TransactionEncoder()  
te_ary = te.fit(data.drop(['OBJECTID', 'REPORTNO', 'INCDATE', 'INCDTTM'], axis=1).apply(lambda x: x.astype(str)).values.tolist())  
df_te = pd.DataFrame(te_ary, columns=te.columns_)  
  
# 使用Apriori算法找出频繁项集  
frequent_itemsets = apriori(df_te, min_support=0.05, use_colnames=True)  
  
# 生成关联规则  
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)  
print(rules)

In [None]:
#随机森林分类器建模与评估  
# 划分训练集和测试集  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  
# 创建随机森林分类器  
clf = RandomForestClassifier(n_estimators=100, random_state=42)  
# 交叉验证评估模型性能  
scores = cross_val_score(clf, X_train, y_train, cv=5)  
print(f"Cross-validation scores: {scores.mean()} ± {scores.std()}")  
# 训练模型  
clf.fit(X_train, y_train)  
# 预测测试集  
y_pred = clf.predict(X_test)  
# 输出混淆矩阵和分类报告  
print("Confusion Matrix:")  
print(confusion_matrix(y_test, y_pred))  
print("\nClassification Report:")  
print(classification_report(y_test, y_pred)) 

In [None]:
# K均值聚类分析  
# 选择部分特征进行聚类  
clustering_data = data[['PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT']]  
# K均值聚类  
kmeans = KMeans(n_clusters=3, random_state=42)  
kmeans.fit(clustering_data)  
# 添加聚类标签到原始数据  
data['Cluster'] = kmeans.labels_  

In [None]:
# Apriori关联规则挖掘  
# 选择需要进行关联规则挖掘的特征，这里以是否超速和是否撞到停放的车辆为例  
transactions = data[['SPEEDING', 'HITPARKEDCAR']].applymap(lambda x: 'Y' if x == 'Y' else None).dropna(how='all')  
# 将数据转换为适合Apriori算法的格式  
te = TransactionEncoder()  
te_ary = te.fit(transactions).transform(transactions)  
df = pd.DataFrame(te_ary, columns=te.columns_)  
# 使用Apriori算法找出频繁项集  
frequent_itemsets = apriori(df, min_support=0.07, use_colnames=True)  
# 生成关联规则  
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)  
# 输出关联规则  
print("Association Rules:")  
print(rules)