In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from utils.logger import logger

# Global plotting defaults for the rest of the notebook.
# Start from the 'seaborn-v0_8-darkgrid' matplotlib style sheet.
plt.style.use('seaborn-v0_8-darkgrid')
# Enlarge seaborn's font sizes by a factor of 1.2.
# NOTE(review): sns.set() presumably also applies seaborn's own default theme,
# which may override parts of the style sheet chosen above — confirm the
# resulting look is the intended one.
sns.set(font_scale=1.2)
# Default figure size (width, height) in inches; assigned after the two calls
# above, presumably so that neither of them resets it.
plt.rcParams['figure.figsize'] = (16, 10)


In [None]:
# Read the cleaned, 1-minute downsampled parquet file into a DataFrame.
file_path = (
    "Data/downsampleData_scratch_1minut/"
    "Contacting_cleaned_1minut_20250802_170647.parquet"
)
df = pd.read_parquet(path=file_path)

# Preview the leading rows; as the cell's last expression it is rendered richly.
df.head(n=5)


In [None]:
# Quick overview of the frame: dimensions, column names, per-column NaN counts.
print("Data shape:", df.shape)
print("\nColumns in the dataset:", df.columns.tolist(), sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime
import warnings
# Silence library warnings so they do not clutter the plot output.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas/matplotlib — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Apply the figure style FIRST. A style sheet may carry its own font settings
# (e.g. 'font.sans-serif'), so applying it after the font configuration below
# would overwrite the CJK font choice.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# CJK font support: the plots below use Chinese titles and labels.
# SimHei is listed first so it is the preferred face; DejaVu Sans remains as
# the fallback when SimHei is not installed.
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
# Render the minus sign as ASCII '-' so it does not depend on the CJK font
# providing the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

print("Libraries loaded successfully!")


In [None]:
# Load the dataset.
data_path = 'Data/downsampleData_scratch_1minut/Contacting_cleaned_1minut_20250802_170647.parquet'
print(f"Loading data from: {data_path}")

try:
    df = pd.read_parquet(data_path)
except Exception as e:
    print(f"Error loading data: {e}")
    # Re-raise instead of swallowing: every later cell needs `df`, and
    # continuing would either fail further down with a confusing NameError or
    # silently reuse a stale `df` left over from an earlier cell.
    raise
else:
    # Only report success details when the read actually succeeded.
    print(f"Data loaded successfully! Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst 5 rows:")
    print(df.head())


In [None]:
# Basic information about the dataset: shape, dtypes and missing-value counts.
print("数据基本信息:", f"数据形状: {df.shape}", sep="\n")

# Each section is a heading followed by the corresponding per-column summary.
for heading, summary in (("数据类型", df.dtypes), ("缺失值统计", df.isna().sum())):
    print(f"\n{heading}:")
    print(summary)


In [None]:
# Make sure TimeStamp is a datetime column and the rows are in time order.
if 'TimeStamp' not in df.columns:
    print("Warning: TimeStamp column not found!")
else:
    df['TimeStamp'] = pd.to_datetime(df['TimeStamp'])
    ts_start, ts_end = df['TimeStamp'].min(), df['TimeStamp'].max()
    print(f"TimeStamp range: {ts_start} to {ts_end}")

    # Sort chronologically and renumber the index 0..n-1.
    df = df.sort_values(by='TimeStamp', ignore_index=True)
    print("Data sorted by TimeStamp")


In [None]:
# Pick the first five columns other than TimeStamp as the features to plot.
# NOTE(review): columns are taken purely by position; the plotting cells call
# .mean()/.std() on them, so they are assumed to be numeric — confirm.
feature_cols = [col for col in df.columns if col != 'TimeStamp']
top5_features = feature_cols[:5]

print(f"前5个特征: {top5_features}")

# Upper bound on the number of rows handed to the plotting cells.
MAX_VIZ_ROWS = 10000

# Build the visualisation subset (down-sample when the frame is large).
if len(df) > MAX_VIZ_ROWS:
    # Evenly spaced sampling keeps the time trend. Ceiling division guarantees
    # at most MAX_VIZ_ROWS rows are kept; floor division produced a step of 1
    # for 10001-19999 rows, i.e. no down-sampling at all.
    sample_step = (len(df) + MAX_VIZ_ROWS - 1) // MAX_VIZ_ROWS
    df_viz = df.iloc[::sample_step].copy()
    print(f"数据量较大，采样显示。原始数据: {len(df)} 行，采样后: {len(df_viz)} 行")
else:
    df_viz = df.copy()
    print(f"使用全部数据进行可视化: {len(df_viz)} 行")


In [None]:
# One stacked panel per feature, each showing the series over time together
# with a dashed line at its mean.

# Colour cycle; also used by the normalised overlay plot in the next cell.
colors = ['blue', 'green', 'red', 'orange', 'purple']

if not top5_features:
    # plt.subplots() rejects a grid with zero rows, so skip plotting entirely.
    print("没有可用于绘图的特征")
else:
    # squeeze=False always returns a 2-D array of Axes, so the single-feature
    # case needs no special handling.
    fig, axes = plt.subplots(len(top5_features), 1,
                             figsize=(15, 3*len(top5_features)), squeeze=False)
    axes = axes[:, 0]

    for i, feature in enumerate(top5_features):
        panel = axes[i]
        if feature in df_viz.columns:
            panel.plot(df_viz['TimeStamp'], df_viz[feature],
                       color=colors[i % len(colors)], linewidth=0.8, alpha=0.8)
            panel.set_title(f'{feature} 随时间变化趋势', fontsize=14, fontweight='bold')
            panel.set_xlabel('时间', fontsize=12)
            panel.set_ylabel(feature, fontsize=12)
            panel.grid(True, alpha=0.3)
            panel.tick_params(axis='x', rotation=45)

            # Reference line at the feature's mean value.
            mean_val = df_viz[feature].mean()
            panel.axhline(y=mean_val, color='red', linestyle='--', alpha=0.7,
                          label=f'Mean: {mean_val:.2f}')
            panel.legend(loc='upper right')
        else:
            # Placeholder text when the feature is missing from the subset.
            panel.text(0.5, 0.5, f'特征 {feature} 不存在',
                       transform=panel.transAxes, ha='center', va='center')

    plt.tight_layout()
    plt.show()

# Text summary of the same features: mean, std, min and max.
print("\n各特征统计信息:")
for feature in top5_features:
    if feature in df_viz.columns:
        print(f"{feature}: Mean={df_viz[feature].mean():.4f}, Std={df_viz[feature].std():.4f}, Min={df_viz[feature].min():.4f}, Max={df_viz[feature].max():.4f}")


In [None]:
# Overlay all selected features in one figure after z-score normalisation so
# that series with different scales can be compared directly.
# Relies on `colors` defined in the per-feature subplot cell above.
fig, ax = plt.subplots(figsize=(15, 8))

for i, feature in enumerate(top5_features):
    if feature in df_viz.columns:
        series = df_viz[feature]
        centered = series - series.mean()
        spread = series.std()
        # Z-score. A constant series (std == 0) or a single-row series
        # (std is NaN) would otherwise turn into an all-NaN line that silently
        # disappears from the plot while still appearing in the legend; fall
        # back to the centred values (a flat line at 0) in that case.
        normalized_data = centered / spread if spread > 0 else centered
        ax.plot(df_viz['TimeStamp'], normalized_data,
                label=feature, color=colors[i % len(colors)],
                linewidth=1.2, alpha=0.8)

ax.set_title('前5个特征标准化后的时间序列对比', fontsize=16, fontweight='bold')
ax.set_xlabel('时间', fontsize=14)
ax.set_ylabel('标准化值', fontsize=14)
# Place the legend outside the plotting area, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# When an anomaly label column exists, report its statistics and show where
# the labelled points fall on the time axis.
if 'anomaly_label' not in df_viz.columns:
    print("\n未找到异常标签列")
else:
    total_count = len(df_viz)
    anomaly_count = df_viz['anomaly_label'].sum()
    anomaly_rate = anomaly_count / total_count * 100

    print("\n异常检测统计:")
    print(f"总样本数: {total_count}")
    print(f"异常样本数: {anomaly_count}")
    print(f"异常率: {anomaly_rate:.2f}%")

    # Timeline figure: label-0 points are drawn at y=0, label-1 points at y=1.
    fig, ax = plt.subplots(figsize=(15, 6))

    normal_mask = df_viz['anomaly_label'] == 0
    anomaly_mask = df_viz['anomaly_label'] == 1

    # Points labelled 0 (normal).
    ax.scatter(df_viz.loc[normal_mask, 'TimeStamp'],
               np.zeros(normal_mask.sum()),
               c='blue', alpha=0.6, s=10, label=f'正常点 ({normal_mask.sum()})')

    # Points labelled 1 (anomalous), drawn only when at least one exists.
    if anomaly_mask.any():
        ax.scatter(df_viz.loc[anomaly_mask, 'TimeStamp'],
                   np.ones(anomaly_mask.sum()),
                   c='red', alpha=0.8, s=20, label=f'异常点 ({anomaly_mask.sum()})')

    ax.set_title('异常点时间分布', fontsize=16, fontweight='bold')
    ax.set_xlabel('时间', fontsize=14)
    ax.set_ylabel('异常标签', fontsize=14)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['正常', '异常'])
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Correlation heatmap of the selected features (needs at least two of them).
if len(top5_features) <= 1:
    print("\n特征数量不足，无法计算相关性")
else:
    fig, ax = plt.subplots(figsize=(10, 8))

    # Pairwise correlation between the selected features.
    correlation_matrix = df_viz[top5_features].corr()

    # Annotated heatmap with the colour scale centred on zero.
    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.3f',
        cbar_kws={'shrink': 0.8},
    )
    sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

    ax.set_title('前5个特征相关性热力图', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Also print the matrix as text below the figure.
    print("\n特征间相关性:", correlation_matrix, sep="\n")


In [None]:
# Closing summary: which features were shown, and that no files were written.
closing_lines = (
    "\n=== 数据可视化完成 ===",
    f"已展示前5个特征: {top5_features}",
    "所有图表已在上方显示，无需保存文件",
)
print(*closing_lines, sep="\n")
