In [None]:
# Inspect the basics of the training set: null counts, dtypes, number of columns.
import pandas as pd
data = pd.read_csv("../playground-series-s4e5/train.csv")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
data.info()

In [None]:
# Remove the "id" column before analysis.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a plain
# drop would raise KeyError on the second execution.
data = data.drop(columns="id", errors="ignore")
data.head()

In [None]:
# Remove fully duplicated rows (the first occurrence of each is kept), then
# show summary statistics of what remains.
data.drop_duplicates(keep="first", inplace=True)
data.describe()

In [None]:
# Per-column statistics: start from the list of every column name in the frame.
cols = list(data.columns)

def calculate_statistics(data, features):
    """Compute descriptive statistics for the selected columns.

    Args:
        data: DataFrame holding the columns to summarise.
        features: Iterable of column names in ``data``.

    Returns:
        DataFrame with one row per feature and one column per statistic
        (mean, median, standard deviation, variance, range, min, max,
        skewness, kurtosis), labelled with the Chinese keys used below.
    """
    stats_dict = {}
    for col in features:
        feature = data[col]
        col_min = feature.min()
        col_max = feature.max()
        stats_dict[col] = {
            '均值': feature.mean(),
            '中位数': feature.median(),
            '标准差': feature.std(),
            '方差': feature.var(),
            # Range of this column only. Subtracting data.min() (the whole
            # frame's per-column minima) would put a Series in this cell.
            '极差': col_max - col_min,
            '最小值': col_min,
            '最大值': col_max,
            '偏度': feature.skew(),
            '峰度': feature.kurtosis(),
        }
    # Transpose so that features are rows and statistics are columns.
    return pd.DataFrame(stats_dict).T

# Build the statistics table for every column and print it rounded to 4 decimals.
feature_stats = calculate_statistics(data=data, features=cols)
print("特征统计量:", feature_stats.round(4), sep="\n")

In [None]:
#数据可视化
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy.polynomial.polynomial import Polynomial

# Configure the plot style and fonts that can render Chinese labels.
# sns.set_style() must run first: it writes its own 'font.sans-serif' list into
# rcParams, so calling it after the font override would discard the
# Chinese-capable fonts configured below.
sns.set_style("whitegrid")
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
# Draw minus signs with the ASCII hyphen so they render with these fonts.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (12, 8)

# One tall figure with a box plot and a histogram for every column.
# Rows [0, n_plots) hold the box plots and rows [n_plots, 2 * n_plots) hold the
# histograms. The layout is derived from len(cols) so it stays valid if the
# column set changes (it was previously hard-coded as 42 rows / offset 21).
n_plots = len(cols)
ROW_HEIGHT = 8     # inches of figure height per subplot row
TREND_DEGREE = 8   # maximum degree of the polynomial trend line
fig, axes = plt.subplots(2 * n_plots, 1, figsize=(10, ROW_HEIGHT * 2 * n_plots))

for i, col in enumerate(cols):
    sns.boxplot(y=data[col], ax=axes[i])
    axes[i].set_title(f"Box Plot of {col}")

for i, col in enumerate(cols):
    hist_ax = axes[i + n_plots]
    if col == "FloodProbability":
        # Plotted as a continuous variable: fixed 50 bins plus a KDE curve.
        sns.histplot(data[col], kde=True, ax=hist_ax, bins=50, color='skyblue')
        hist_ax.set_title(f'{col} 分布图', fontsize=14, fontweight='bold')
        hist_ax.set_xlabel(col)
        hist_ax.set_ylabel('频数')
    else:
        # Every other column is plotted as discrete values: one unit-wide bar
        # per integer, starting from 0. discrete=True already fixes the bin
        # width at 1, so no separate bins/binwidth arguments are needed.
        col_max = data[col].max()
        sns.histplot(
            data[col],
            discrete=True,
            stat="count",
            color="skyblue",
            edgecolor="white",
            binrange=(0, col_max),
            ax=hist_ax,
        )
        hist_ax.set_xticks(np.arange(0, col_max + 1, 1))
        hist_ax.set_xlabel("Value")
        hist_ax.set_ylabel("Count")
        hist_ax.set_title(f"Distribution of {col}")

        # Fit the trend line to THIS column's value counts (not the first
        # column's), capping the degree so the fit stays well-posed when a
        # column has fewer than TREND_DEGREE + 1 distinct values.
        values, counts = np.unique(data[col], return_counts=True)
        degree = min(TREND_DEGREE, len(values) - 1)
        if degree >= 1:
            poly = Polynomial.fit(values, counts, deg=degree)
            # NOTE(review): the curve stops 4 units short of the column
            # maximum, as before; presumably this hides the polynomial's
            # oscillation near the upper edge. Confirm the cutoff is intended.
            x_curve = np.linspace(0, col_max - 4, 100)
            y_curve = poly(x_curve)
            # Hide non-positive fitted values instead of drawing below the axis.
            y_curve[y_curve <= 0] = np.nan
            hist_ax.plot(x_curve, y_curve, color="blue", linewidth=2, label="Trend Line (Poly Fit)")
            # Without a legend the line's label is never displayed.
            hist_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Automated univariate analysis: per-column summary plus a type-appropriate plot.
def univariate_analysis(data):
    """Print a summary and draw a plot for every column of ``data``.

    For each column the function prints its dtype and ``describe()`` output.
    Numeric columns then get a histogram and a box plot side by side; every
    other column gets its value counts printed and a count plot.

    Args:
        data: DataFrame whose columns are analysed one at a time.

    Returns:
        None. Output is printed and shown as matplotlib figures.
    """
    for column in data.columns:
        series = data[column]
        print(f"Univariate Analysis for Column: {column}")
        print("--------------------------------------------------")
        print("Data Type:", series.dtype)
        print(series.describe())  # Summary statistics

        # Branch on "is numeric" rather than "dtype == 'object'": category,
        # pandas string and datetime columns are not 'object' but cannot be
        # drawn as a numeric histogram either.
        if pd.api.types.is_numeric_dtype(series):
            # Numeric columns: histogram and box plot side by side
            plt.figure(figsize=(12, 5))

            # Histogram
            plt.subplot(1, 2, 1)
            series.plot(kind='hist', bins=10, color='skyblue', edgecolor='black')
            plt.title(f'Histogram of {column}')
            plt.xlabel(column)

            # Box plot
            plt.subplot(1, 2, 2)
            sns.boxplot(y=series)
            plt.title(f'Box Plot of {column}')

            plt.show()

        else:
            # Non-numeric columns: show value counts
            print("\nValue Counts:")
            print(series.value_counts())

            # Bar plot
            plt.figure(figsize=(8, 4))
            sns.countplot(y=series)
            plt.title(f'Bar Plot of {column}')
            plt.show()

        print("\n\n")

# Run the univariate analysis on every column of the cleaned training data.
univariate_analysis(data=data)

In [None]:
# Pairwise correlation between all analysed columns, shown as an annotated heatmap.
correlation_matrix = data.loc[:, cols].corr()

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()