In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

sns.set_theme()

# Read Data
data_files = [f"../data/{year}.csv" for year in range(2017, 2022)]
data_frames = []

for year, file in zip(range(2017, 2022), data_files):
    df = pd.read_csv(file, na_values='?')
    df['Year'] = year  # Add a 'Year' column with the corresponding year
    data_frames.append(df)
data = pd.concat(data_frames, ignore_index=True)

data.fillna(data.median(), inplace=True)
data = data.rename(columns={"class": "isBankrupt"})


In [12]:
# 选择用于聚类的特征列（这里假设使用所有的指标除了'Year'和'isBankrupt'）
features = data.columns.drop(['Year', 'isBankrupt'])

# 标准化数据
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[features])

# 将标准化后的数据转换回DataFrame
data_scaled = pd.DataFrame(data_scaled, columns=features)
data_scaled['Year'] = data['Year']  # 保留年份信息

In [18]:
# 使用K-means聚类
kmeans = KMeans(n_clusters=1000, random_state=42)  # 假设有10个聚类
data_scaled['Cluster'] = kmeans.fit_predict(data_scaled[features])

# 分析聚类结果
clustered_data = pd.concat([data_scaled, data[['isBankrupt']]], axis=1)


  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
# 检查每个簇的年份分布
clusters = data_scaled.groupby('Cluster')['Year'].unique()

# 为了简化展示，这里只打印每个簇的年份情况
for cluster_id, years in clusters.items():
    print(f"Cluster {cluster_id}: Years = {years}")

def check_clusters(data_scaled):
    clusters = data_scaled.groupby('Cluster')['Year'].unique()
    valid_clusters = []
    invalid_clusters = []

    for cluster_id, years in clusters.items():
        if len(years) == len(set(years)) and len(years) <= 5:
            valid_clusters.append(cluster_id)
        else:
            invalid_clusters.append(cluster_id)

    return valid_clusters, invalid_clusters

valid_clusters, invalid_clusters = check_clusters(data_scaled)
print("Valid Clusters:", valid_clusters)
print("Invalid Clusters:", invalid_clusters)


Cluster 0: Years = [2017 2018 2019 2020 2021]
Cluster 1: Years = [2017]
Cluster 2: Years = [2018]
Cluster 3: Years = [2018 2019]
Cluster 4: Years = [2020]
Cluster 5: Years = [2017]
Cluster 6: Years = [2019]
Cluster 7: Years = [2018]
Cluster 8: Years = [2021]
Cluster 9: Years = [2018]
Cluster 10: Years = [2018]
Cluster 11: Years = [2017]
Cluster 12: Years = [2017]
Cluster 13: Years = [2021]
Cluster 14: Years = [2019]
Cluster 15: Years = [2021]
Cluster 16: Years = [2019]
Cluster 17: Years = [2019]
Cluster 18: Years = [2018]
Cluster 19: Years = [2020]
Cluster 20: Years = [2020]
Cluster 21: Years = [2018]
Cluster 22: Years = [2018]
Cluster 23: Years = [2020]
Cluster 24: Years = [2019]
Cluster 25: Years = [2017]
Cluster 26: Years = [2018]
Cluster 27: Years = [2017]
Cluster 28: Years = [2020]
Cluster 29: Years = [2020]
Cluster 30: Years = [2021]
Cluster 31: Years = [2017]
Cluster 32: Years = [2017]
Cluster 33: Years = [2017 2018 2019 2020 2021]
Cluster 34: Years = [2021]
Cluster 35: Years = 

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 准备训练数据
X = data_scaled[features]  # 使用标准化后的特征
y = data['isBankrupt']  # 目标变量

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 构建逻辑回归模型
model = LogisticRegression()
model.fit(X_train, y_train)

# 模型评估
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8240
           1       0.50      0.01      0.02       441

    accuracy                           0.95      8681
   macro avg       0.72      0.50      0.50      8681
weighted avg       0.93      0.95      0.93      8681

[[8236    4]
 [ 437    4]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
