为了节省内存占用，数据预处理的部分如果不需要在机器学习阶段进行流式处理，可以先在清洗后保存为 CSV 文件，之后训练的时候再读取。

In [21]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import networkx as nx
import chardet
import warnings
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore') # Ignore every warning for the rest of the Python session.


In [22]:
# Load the cleaned dataset (flight columns plus TMAX/TMIN/PRCP weather columns).
combined_flight_data = pd.read_csv('merged_weatherr_Final.csv')

# info() writes its summary straight to stdout and returns None, so wrapping
# it in print() also emits a trailing "None" line.
print(combined_flight_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142663 entries, 0 to 142662
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Month              142663 non-null  int64  
 1   DayofMonth         142663 non-null  int64  
 2   DepTime            142663 non-null  float64
 3   ArrTime            142663 non-null  float64
 4   UniqueCarrier      142663 non-null  object 
 5   TailNum            142663 non-null  object 
 6   ArrDelay           142663 non-null  float64
 7   DepDelay           142663 non-null  float64
 8   Origin             142663 non-null  object 
 9   Dest               142663 non-null  object 
 10  Cancelled          142663 non-null  int64  
 11  CancellationCode   2603 non-null    object 
 12  CarrierDelay       142663 non-null  int64  
 13  WeatherDelay       142663 non-null  int64  
 14  NASDelay           142663 non-null  int64  
 15  SecurityDelay      142663 non-null  int64  
 16  La

In [23]:
# Count the missing values in each column.
print(combined_flight_data.isnull().sum())

Month                     0
DayofMonth                0
DepTime                   0
ArrTime                   0
UniqueCarrier             0
TailNum                   0
ArrDelay                  0
DepDelay                  0
Origin                    0
Dest                      0
Cancelled                 0
CancellationCode     140060
CarrierDelay              0
WeatherDelay              0
NASDelay                  0
SecurityDelay             0
LateAircraftDelay         0
TMAX                      0
TMIN                      0
PRCP                      0
dtype: int64


In [24]:
# Missing values have to be filled once more: replace the nulls in
# CancellationCode with the literal string 'None'.
fillNone = ['CancellationCode']
combined_flight_data[fillNone] = combined_flight_data[fillNone].fillna('None')


# Divide the weather temperature readings by 10 and round them to whole
# numbers (both columns are handled in a single vectorised step).
divedecol = ['TMAX', 'TMIN']
combined_flight_data[divedecol] = (combined_flight_data[divedecol] / 10).round(0)

intcol = ['PRCP']
combined_flight_data[intcol] = combined_flight_data[intcol].astype(int)

# Convert the time columns to the new format by floor-dividing by 100
# (presumably keeps the hour part of an HHMM value -- confirm against the raw data).
Timerols = ['DepTime', 'ArrTime']
combined_flight_data[Timerols] = combined_flight_data[Timerols].floordiv(100)

# info() prints the summary itself and returns None, which print() then echoes.
print(combined_flight_data.info())

# Optional export of the intermediate result (disabled):
# combined_flight_data.to_csv('FinalWeather.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142663 entries, 0 to 142662
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Month              142663 non-null  int64  
 1   DayofMonth         142663 non-null  int64  
 2   DepTime            142663 non-null  float64
 3   ArrTime            142663 non-null  float64
 4   UniqueCarrier      142663 non-null  object 
 5   TailNum            142663 non-null  object 
 6   ArrDelay           142663 non-null  float64
 7   DepDelay           142663 non-null  float64
 8   Origin             142663 non-null  object 
 9   Dest               142663 non-null  object 
 10  Cancelled          142663 non-null  int64  
 11  CancellationCode   142663 non-null  object 
 12  CarrierDelay       142663 non-null  int64  
 13  WeatherDelay       142663 non-null  int64  
 14  NASDelay           142663 non-null  int64  
 15  SecurityDelay      142663 non-null  int64  
 16  La

In [25]:
# Columns that are label-encoded later on; they may be fed into the model afterwards.
transform_to_positive=['ArrDelay','DepDelay']

# NOTE(review): this list is not referenced anywhere else in the visible part of the notebook.
transform_to_cate=['CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'] 



# Turn a numeric delay value (negative values included) into a severity category.
def trans_to_pos_cate(row,col,Refcol):
    """Classify the delay stored in ``row[col]`` as a text category.

    Args:
        row: Record indexable by column name (for example a DataFrame row).
        col: Name of the delay column to classify; it is also appended to
            the severity prefix of the returned label.
        Refcol: Name of the flag column; any value other than 0 marks the
            record as cancelled.

    Returns:
        'Cancel' when ``row[Refcol]`` is not 0. Otherwise 'In Time' for a
        delay <= 0, 'Slight' + col for (0, 20], 'Medium' + col for (20, 60]
        and 'Serious' + col for everything else.
    """
    # Cancelled records never receive a delay category.
    if row[Refcol] != 0:
        return 'Cancel'

    delay = row[col]
    if delay <= 0:
        return 'In Time'
    if delay <= 20:
        return 'Slight' + col
    if delay <= 60:
        return 'Medium' + col
    return 'Serious' + col

# Try to convert the categorical columns towards a label encoding: first
# replace each numeric delay column with its severity category.
# NOTE: because of the input data, the label-encoding rules must also fit any
# newly supplied dataset, since incoming data will contain categorical values.
for col in transform_to_positive:
    combined_flight_data[col] = combined_flight_data.apply(
        trans_to_pos_cate, axis=1, args=(col, 'Cancelled')
    )
    print(combined_flight_data[col].head())


0    SlightArrDelay
1    SlightArrDelay
2    MediumArrDelay
3           In Time
4           In Time
Name: ArrDelay, dtype: object
0    SlightDepDelay
1    SlightDepDelay
2    MediumDepDelay
3           In Time
4           In Time
Name: DepDelay, dtype: object


In [26]:

# Check whether any rows still contain null values.
print(combined_flight_data.isnull().sum())


Month                0
DayofMonth           0
DepTime              0
ArrTime              0
UniqueCarrier        0
TailNum              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Cancelled            0
CancellationCode     0
CarrierDelay         0
WeatherDelay         0
NASDelay             0
SecurityDelay        0
LateAircraftDelay    0
TMAX                 0
TMIN                 0
PRCP                 0
dtype: int64


In [27]:

def combine_to_cate(row):
    """Collapse the cancellation code and the five delay columns into one label.

    Args:
        row: Record indexable by column name; it must provide
            'CancellationCode' and the five delay-cause columns.

    Returns:
        'Cancelled' when CancellationCode is anything but the string 'None';
        'In Time or bit late' when every delay-cause column equals 0;
        otherwise the name of the delay-cause column holding the largest
        value (the first such column wins a tie).
    """
    if row['CancellationCode'] != 'None':
        return 'Cancelled'

    delay_causes = ('CarrierDelay', 'WeatherDelay', 'NASDelay',
                    'SecurityDelay', 'LateAircraftDelay')
    if any(row[cause] != 0 for cause in delay_causes):
        # Rule for the five delay columns: report the one with the maximum value.
        return max(delay_causes, key=lambda cause: row[cause])
    return 'In Time or bit late'
    # Disabled alternative: map CancellationCode 'A' / 'B' / 'C' / 'D' to
    # 'CarrierCal' / 'WeatherCal' / 'NasCal' / 'SecurityCal' instead of
    # returning the single 'Cancelled' label.

# Derive one ReasonForDelay label per row.
reason_for_delay = combined_flight_data.apply(combine_to_cate, axis=1)
combined_flight_data['ReasonForDelay'] = reason_for_delay


# Check that the new column has been created.
print(combined_flight_data['ReasonForDelay'].head(5))


# Drop the raw cancellation and delay-cause columns.
combined_flight_data = combined_flight_data.drop(
    columns=[
        'Cancelled',
        'CancellationCode',
        'CarrierDelay',
        'WeatherDelay',
        'NASDelay',
        'SecurityDelay',
        'LateAircraftDelay',
    ]
)



# Optional export of the cleaned sample (disabled):
# combined_flight_data.to_csv('cleansamdata.csv',index=False)


0    In Time or bit late
1    In Time or bit late
2      LateAircraftDelay
3    In Time or bit late
4    In Time or bit late
Name: ReasonForDelay, dtype: object


In [28]:

# Confirm the frame's dimensions and column dtypes at this point.
print(combined_flight_data.shape)
# info() prints the summary itself and returns None, which print() then echoes.
print(combined_flight_data.info())


(142663, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142663 entries, 0 to 142662
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Month           142663 non-null  int64  
 1   DayofMonth      142663 non-null  int64  
 2   DepTime         142663 non-null  float64
 3   ArrTime         142663 non-null  float64
 4   UniqueCarrier   142663 non-null  object 
 5   TailNum         142663 non-null  object 
 6   ArrDelay        142663 non-null  object 
 7   DepDelay        142663 non-null  object 
 8   Origin          142663 non-null  object 
 9   Dest            142663 non-null  object 
 10  TMAX            142663 non-null  float64
 11  TMIN            142663 non-null  float64
 12  PRCP            142663 non-null  int32  
 13  ReasonForDelay  142663 non-null  object 
dtypes: float64(4), int32(1), int64(2), object(7)
memory usage: 14.7+ MB
None


In [29]:

# Re-check the per-column count of missing values.
print(combined_flight_data.isnull().sum())


Month             0
DayofMonth        0
DepTime           0
ArrTime           0
UniqueCarrier     0
TailNum           0
ArrDelay          0
DepDelay          0
Origin            0
Dest              0
TMAX              0
TMIN              0
PRCP              0
ReasonForDelay    0
dtype: int64


In [30]:
import shelve

# Label-encode every object-typed column and persist the encoders.

objcolumns1 = combined_flight_data.select_dtypes(include=['object']).columns

print(objcolumns1)

# Per-column bookkeeping: the fitted encoder plus label->code and
# code->label lookup tables.
encoders = {}

for col in objcolumns1:
    # Use a fresh encoder for each column. Re-fitting one shared instance
    # would leave every stored 'encoder' pointing at the same object, fitted
    # only to the last column processed.
    le = LabelEncoder()
    combined_flight_data[col] = le.fit_transform(combined_flight_data[col])

    # Build the label->code table once and invert it for decoding.
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    encoders[col] = {
        'encoder': le,
        'mapping': mapping,
        'inverse_mapping': {code: label for label, code in mapping.items()},
    }


# Persist the encoders so the same encoding can be reapplied or reversed later.
with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)





Index(['UniqueCarrier', 'TailNum', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',
       'ReasonForDelay'],
      dtype='object')


In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import pickle


In [32]:
'''
改进：（1）舍弃了不必要的分类数据 TailNum 和 24H（后者只是在后面用作对比的列），使得我们能够正确选取训练集 X 的特征。
（2）在上面的模型中，不知道为什么 Unique 等分类列没有通过 get_dummies 转换成独热编码。这里在训练集中重新对其进行
了独热编码，使得数据能正确输入到随机森林模型当中。


问题：（1）在解决了特征选取等一系列问题之后，随机森林模型仍然无法进行训练，尚不清楚是模型本身的性能问题、计算机性能的问题，还是
数据处理的问题。
'''
# Inspect column dtypes again; info() returns None, so print() also emits "None".
print(combined_flight_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142663 entries, 0 to 142662
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Month           142663 non-null  int64  
 1   DayofMonth      142663 non-null  int64  
 2   DepTime         142663 non-null  float64
 3   ArrTime         142663 non-null  float64
 4   UniqueCarrier   142663 non-null  int32  
 5   TailNum         142663 non-null  int32  
 6   ArrDelay        142663 non-null  int32  
 7   DepDelay        142663 non-null  int32  
 8   Origin          142663 non-null  int32  
 9   Dest            142663 non-null  int32  
 10  TMAX            142663 non-null  float64
 11  TMIN            142663 non-null  float64
 12  PRCP            142663 non-null  int32  
 13  ReasonForDelay  142663 non-null  int32  
dtypes: float64(4), int32(8), int64(2)
memory usage: 10.9 MB
None


In [33]:

# print(cleaned_combined_flight_data.dtypes)

# The three label columns form the multi-column target; every remaining
# column is used as a feature.
target = ['ArrDelay', 'DepDelay', 'ReasonForDelay']
y2 = combined_flight_data[target]
X = combined_flight_data.drop(columns=target)


# Split the data into training and testing sets (20% held out, fixed seed).
X_train, X_test, y2_train, y2_test = train_test_split(
    X,
    y2,
    test_size=0.2,
    random_state=42,
)






In [34]:

# Sanity-check the dimensions of the target frame.
print(y2.shape)


(142663, 3)


In [35]:

# Train a Random Forest model
#目前挺进到这里的数据转型和处理阶段
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from keras.layers import Input, Dense
from keras.utils import to_categorical
from tensorflow.keras.regularizers import l1_l2

# Random forest
model = RandomForestClassifier(random_state=42)
# Fit on the training split; y2_train carries all three target columns.
model.fit(X_train, y2_train)

In [36]:

# Make predictions for the held-out test features.
y_pred = model.predict(X_test)

In [37]:

# Define a function to decode the encoded values back to original

def encode_category(col, value):
    """Return the encoded value stored for label *value* of column *col*.

    The lookup uses ``encoders[col]['mapping']``; ``None`` is returned when
    the label is not present in that mapping.
    """
    label_to_code = encoders[col]['mapping']
    return label_to_code.get(value)

def decode_category(col, encoded_value):
    """Return the original label stored for *encoded_value* of column *col*.

    The lookup uses ``encoders[col]['inverse_mapping']``; ``None`` is
    returned when the encoded value is not present in that mapping.
    """
    code_to_label = encoders[col]['inverse_mapping']
    return code_to_label.get(encoded_value)

# Give the actual and predicted targets the same three column names.
target_columns = ['ArrDelay', 'DepDelay', 'ReasonForDelay']
y2_test = pd.DataFrame(y2_test, columns=target_columns)
y_pred = pd.DataFrame(y_pred, columns=target_columns)

# print(y2_test.info())
# print(y_pred.info())

# Decode every target column back to its original label: first the three
# "Actual_*" columns, then the three "Predicted_*" columns.
decoded_columns = {}
for prefix, frame in (('Actual', y2_test), ('Predicted', y_pred)):
    for target_name in target_columns:
        decoded_columns[f'{prefix}_{target_name}'] = [
            decode_category(target_name, code) for code in frame[target_name]
        ]

df_result = pd.DataFrame(decoded_columns)


In [41]:

# Preview the decoded actual-vs-predicted table.
print(df_result.head())

# Write the table to CSV without the index column.
df_result.to_csv('resultRandom.csv',index=False)




   Actual_ArrDelay Actual_DepDelay Actual_ReasonForDelay Predicted_ArrDelay  \
0  SeriousArrDelay  MediumDepDelay          CarrierDelay    SeriousArrDelay   
1          In Time         In Time   In Time or bit late     SlightArrDelay   
2          In Time         In Time   In Time or bit late            In Time   
3   SlightArrDelay  SlightDepDelay   In Time or bit late            In Time   
4          In Time         In Time   In Time or bit late            In Time   

  Predicted_DepDelay Predicted_ReasonForDelay  
0     MediumDepDelay             CarrierDelay  
1            In Time      In Time or bit late  
2            In Time      In Time or bit late  
3            In Time      In Time or bit late  
4            In Time      In Time or bit late  


In [39]:
from sklearn.metrics import accuracy_score

# Calculate the accuracy of the model, one score per target column
# (ArrDelay, DepDelay, ReasonForDelay -- printed in that order).
accuracy0, accuracy1, accuracy2 = (
    accuracy_score(y2_test[label], y_pred[label])
    for label in ('ArrDelay', 'DepDelay', 'ReasonForDelay')
)

for score in (accuracy0, accuracy1, accuracy2):
    print(f'Accuracy: {score * 100:.2f}%')


Accuracy: 63.95%
Accuracy: 70.41%
Accuracy: 81.38%


In [40]:
# Save the model only at the very end, once the accuracy has been displayed;
# whether to save at all is decided from the training results.
with open('flight_modelRaF.pkl', 'wb') as f:
    pickle.dump(model, f)