In [2]:
import pandas as pd
# Load the workbook; header=1 takes the column labels from the second row.
df = pd.read_excel(io='表格.xls', header=1)
print(df.head(n=5))

In [3]:
# For every column, print its label followed by the distinct values it holds.
for label, series in df.items():
    print(label)
    print(series.unique())

In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

In [5]:
# Show the column labels of the loaded frame.
print(df.columns)

In [6]:
# Measurement columns that are coerced to numeric and median-filled below.
columns = [
    'SO2', 'CO(mg/m3)', 'NO2', 'O3-1H',
    'PM10', 'PM2.5', 'NO', 'NOX',
    '湿度', '温度', '风速', '风向', '大气压',
]

In [7]:

# Coerce every measurement column to a numeric dtype in one pass; entries that
# cannot be parsed become NaN because of errors='coerce'.
df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')

In [8]:
# Per-column count of missing (null/NaN) values.
print(df.isnull().sum())

In [9]:
# Fill the gaps in each measurement column with that column's own median.
column_medians = df[columns].median()
df[columns] = df[columns].fillna(column_medians)

In [10]:
# Print descriptive statistics for df after the median fill.
print(df.describe())

In [11]:
# DataFrame.info() prints the dtype / non-null report itself and returns None;
# calling it bare avoids the stray "None" that print(df.info()) would emit.
df.info()

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def replace_outliers_with_median_iqr(df, columns, iqr_scale=1.5):
    """Return a copy of ``df`` with IQR outliers replaced by the column median.

    For each column in ``columns`` a value is treated as an outlier when it is
    below ``Q1 - iqr_scale * IQR`` or above ``Q3 + iqr_scale * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.
    Outliers are replaced by the column median, which is computed on the
    column before any replacement. NaN values never satisfy either comparison,
    so they are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of column labels
        Numeric columns to clean.
    iqr_scale : float, default 1.5
        Multiplier applied to the IQR to position the lower and upper fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the outliers of ``columns`` are replaced.
    """
    df_clean = df.copy()
    for col in columns:
        values = df_clean[col]
        # Quartiles and interquartile range.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        # Fences beyond which a value counts as an outlier.
        lower_bound = q1 - iqr_scale * iqr
        upper_bound = q3 + iqr_scale * iqr
        outlier_mask = (values < lower_bound) | (values > upper_bound)
        # Series.mask upcasts the result when the median does not fit the
        # column dtype (e.g. 3.5 into an int64 column). Writing it through
        # .loc instead triggers pandas' incompatible-dtype FutureWarning and
        # is slated to become an error.
        df_clean[col] = values.mask(outlier_mask, values.median())
    return df_clean

In [13]:
# Pollutant columns whose IQR outliers are replaced by the column median.
columns_to_plot = [
    'SO2', 'CO(mg/m3)', 'NO2', 'PM10', 'PM2.5', 'NO', 'NOX',
]

df_cleaned = replace_outliers_with_median_iqr(df, columns=columns_to_plot)

# Heading line followed by the summary statistics of the cleaned frame.
print("异常值填补后的数据摘要：", df_cleaned.describe(), sep="\n")

In [14]:
# Rebind df to an independent copy of the outlier-cleaned frame; the cells
# below read df and therefore work on the cleaned values.
# NOTE(review): this overwrites the previous df, so re-running earlier cells
# after this one would operate on the cleaned data — confirm that is intended.
df=df_cleaned.copy()

In [15]:
import matplotlib.pyplot as plt
import pandas as pd

columns_to_plot = ['SO2', 'CO(mg/m3)', 'NO2', 'PM10', 'PM2.5', 'NO', 'NOX']

# Parse the '时段' column into datetimes so it can serve as the x-axis.
df['time'] = pd.to_datetime(df['时段'], format='%Y/%m/%d %H:%M:%S')

# One figure per metric. Each iteration opens its own figure, so no figure is
# created ahead of the loop (doing so would only leave an empty canvas behind).
for col in columns_to_plot:
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.plot(df['time'], df[col], label=col)

    ax.set_xlabel('Time', fontsize=12)
    ax.set_ylabel('Concentration', fontsize=12)
    ax.set_title('Air Quality Metrics Over Time', fontsize=14)
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
    ax.grid(alpha=0.3)

    # Rotate and align the date tick labels.
    fig.autofmt_xdate()

    fig.tight_layout()
    plt.show()

In [16]:
# Overlay every metric on one axes. The series come straight from df, which
# holds unscaled values, so the labels must not claim they are standardized.
fig, ax = plt.subplots(figsize=(16, 8))

for col in columns_to_plot:
    ax.plot(df['time'], df[col], label=col, alpha=0.7, linewidth=1)

ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Concentration', fontsize=12)
ax.set_title('Air Quality Metrics Over Time', fontsize=14)
# Anchor the legend just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
ax.grid(alpha=0.3)

# Rotate and align the date tick labels.
fig.autofmt_xdate()

fig.tight_layout()
plt.show()

In [17]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = [
    'SO2', 'CO(mg/m3)', 'NO2', 'PM10', 'PM2.5', 'NO', 'NOX',
]

# Fit a StandardScaler on the selected columns and rebuild a frame on df's
# index, then carry the timestamp column over for plotting.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[columns_to_scale]),
    index=df.index,
    columns=columns_to_scale,
).assign(time=df['time'])

In [18]:
import matplotlib.pyplot as plt

# Overlay all scaled series from df_scaled on a single axes.
fig, ax = plt.subplots(figsize=(16, 8))

line_style = {'alpha': 0.7, 'linewidth': 1}
for col in columns_to_scale:
    ax.plot(df_scaled['time'], df_scaled[col], label=col, **line_style)

ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Standardized Value', fontsize=12)
ax.set_title('Standardized Air Quality Metrics Over Time', fontsize=14)
# Place the legend to the right of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
ax.grid(alpha=0.3)

# Rotate and align the date tick labels.
fig.autofmt_xdate()

fig.tight_layout()
plt.show()

In [19]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the selected columns with a MinMaxScaler (default settings) and
# rebuild a frame on df's index, carrying the timestamp column over.
# NOTE(review): this rebinds both `scaler` and `df_scaled`, replacing any
# earlier objects stored under those names.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[columns_to_scale]),
    index=df.index,
    columns=columns_to_scale,
).assign(time=df['time'])

In [20]:
import matplotlib.pyplot as plt

# Overlay all series from df_scaled on a single axes.
fig, ax = plt.subplots(figsize=(16, 8))

for col in columns_to_scale:
    ax.plot(df_scaled['time'], df_scaled[col], label=col, alpha=0.7, linewidth=1)

ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Normalized Value', fontsize=12)
# The title matches the y-label: this figure shows normalized values, so it
# must not be titled "Standardized".
ax.set_title('Normalized Air Quality Metrics Over Time', fontsize=14)
# Anchor the legend just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
ax.grid(alpha=0.3)

# Rotate and align the date tick labels.
fig.autofmt_xdate()

fig.tight_layout()
plt.show()

In [21]:

# Pairwise correlation (DataFrame.corr defaults) between the plotted columns.
df_subset = df.loc[:, columns_to_plot]
corr_matrix = df_subset.corr()
print(corr_matrix)

In [22]:
import seaborn as sns

fig, ax = plt.subplots(figsize=(12, 8))

# Annotated heatmap of corr_matrix on a fixed [-1, 1] colour scale.
heatmap_options = dict(
    annot=True,         # write the value inside each cell
    fmt=".2f",          # two decimals per annotation
    cmap='coolwarm',
    linewidths=0.5,     # thin separators between cells
    linecolor='white',
    vmin=-1,
    vmax=1,
)
sns.heatmap(data=corr_matrix, ax=ax, **heatmap_options)

# NOTE(review): the title contains CJK characters — confirm the active
# matplotlib font can render them.
ax.set_title('空气质量指标相关性热力图', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
plt.show()