In [None]:
# Basic sanity check: null counts, column dtypes and number of features
import pandas as pd
data = pd.read_csv("../playground-series-s4e5/train.csv")
# info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
data.info()

In [None]:
# Discard the "id" column and preview the first rows of what remains
data = data.drop(columns="id")
data.head()

In [None]:
# Remove fully duplicated rows (first occurrence is kept), then show summary statistics
data = data.drop_duplicates()
data.describe()

In [None]:
# Per-column descriptive statistics: collect every column name of the frame
cols = list(data.columns)

def calculate_statistics(data, features):
    """Compute descriptive statistics for the selected columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to summarise.
    features : iterable of str
        Column names of ``data`` to include.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``features`` and one column per statistic
        (mean, median, standard deviation, variance, range, min, max,
        skewness, kurtosis); the column labels are the dictionary keys
        used below.
    """
    stats_dict = {}
    for col in features:
        feature = data[col]
        stats_dict[col] = {
            '均值': feature.mean(),
            '中位数': feature.median(),
            '标准差': feature.std(),
            '方差': feature.var(),
            # Range of this column only: max minus min of the same Series.
            # (Subtracting data.min() would yield a whole Series of per-column
            # minima instead of a scalar.)
            '极差': feature.max() - feature.min(),
            '最小值': feature.min(),
            '最大值': feature.max(),
            '偏度': feature.skew(),
            '峰度': feature.kurtosis(),
        }
    # The dict-of-dicts is column-per-feature; transpose to get one row per feature
    return pd.DataFrame(stats_dict).T

# Build the statistics table for all columns and print it rounded to 4 decimals
feature_stats = calculate_statistics(data, cols)
rounded_stats = feature_stats.round(4)
print("特征统计量:", rounded_stats, sep="\n")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Density histogram of the target; 90 evenly spaced edges give 89 bins over [0.2825, 0.7275]
bin_edges = np.linspace(0.2825, 0.7275, 90)
fig, ax = plt.subplots(figsize=(6, 2))
ax.hist(data.FloodProbability, bins=bin_edges, density=True)
ax.set_ylabel('density')
ax.set_xlabel('FloodProbability')
plt.show()

In [None]:
import numpy as np
# Must be the pyplot submodule: subplots(), tight_layout() and show() are pyplot
# functions, and binding the top-level package to `plt` would break them here
# and in any later cell that relies on `plt`.
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# Every column except the target is treated as a feature
features = [col for col in data.columns if col != 'FloodProbability']

# One bar chart per feature showing the relative frequency of each distinct value.
# zip() stops at the shorter input, so at most 5 * 4 = 20 features are drawn.
fig, axs = plt.subplots(5, 4, figsize=(12, 12))
for col, ax in zip(features, axs.ravel()):
    vc = data[col].value_counts() / len(data)  # share of rows per value
    ax.bar(vc.index, vc)

    ax.set_title(col)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # only integer labels
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Function for automated univariate analysis
def univariate_analysis(data):
    """Print and plot a per-column univariate report for a DataFrame.

    For every column the dtype and ``describe()`` summary are printed.
    Numeric columns then get a histogram and a box plot side by side;
    all other columns (object, string, category, datetime, ...) get their
    value counts printed and a count plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are analysed one by one.

    Returns
    -------
    None
        Output is produced through ``print`` and matplotlib/seaborn figures.
    """
    for column in data.columns:
        series = data[column]

        print(f"Univariate Analysis for Column: {column}")
        print("--------------------------------------------------")
        print("Data Type:", series.dtype)
        print(series.describe())  # Summary statistics

        # Branch on "is numeric" rather than "is object": a histogram or box
        # plot cannot be drawn for category/string/datetime columns, which an
        # `== 'object'` test would wrongly send down the numeric path.
        if pd.api.types.is_numeric_dtype(series):
            # Numeric columns: Show histogram and box plot
            plt.figure(figsize=(12, 5))

            # Histogram
            plt.subplot(1, 2, 1)
            series.plot(kind='hist', bins=10, color='skyblue', edgecolor='black')
            plt.title(f'Histogram of {column}')
            plt.xlabel(column)

            # Box plot
            plt.subplot(1, 2, 2)
            sns.boxplot(y=series)
            plt.title(f'Box Plot of {column}')

            plt.show()

        else:
            # Non-numeric columns: Show value counts
            print("\nValue Counts:")
            print(series.value_counts())

            # Bar plot
            plt.figure(figsize=(8, 4))
            sns.countplot(y=series)
            plt.title(f'Bar Plot of {column}')
            plt.show()

        print("\n\n")

# Run the univariate report (printed summary plus plots) for every column of `data`
univariate_analysis(data)

In [None]:
# Pairwise correlations between the columns listed in `cols`, drawn as an annotated heatmap
correlation_matrix = data[cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Mean flood probability for each distinct value of the row-wise sum of all features
feature_sum = data[features].sum(axis=1)
temp = data.FloodProbability.groupby(feature_sum).mean()

# Colour flag per point: True where the feature sum is one of 72, 73, 74, 75
in_highlight_band = temp.index.isin(np.arange(72, 76))
plt.scatter(temp.index, temp, s=1, c=in_highlight_band, cmap='coolwarm')
plt.xlabel('sum')
plt.ylabel('flood probability')
plt.show()