In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier

In [None]:
# Read the train and test CSV files straight from their zip archives.
# Paths are relative to the notebook's working directory.
train = pd.read_csv("flight-delays-fall-2018/flight_delays_train.csv.zip", compression='zip')
test = pd.read_csv("flight-delays-fall-2018/flight_delays_test.csv.zip", compression='zip')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train.info()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
test.info()

In [None]:
# Summary statistics of the training frame's numeric columns.
train.describe()

In [None]:
# Stack train and test into a single frame with a fresh 0..n-1 index; the
# feature transformations further down are applied to this combined frame.
all_data = pd.concat([train, test], ignore_index=True)
# Show 15 randomly chosen rows as a quick sanity check.
all_data.sample(15)

In [None]:
# Give the target column a shorter name in both frames.
target_rename = {'dep_delayed_15min': 'delayed'}
train = train.rename(columns=target_rename)
all_data = all_data.rename(columns=target_rename)

In [None]:
# Encode the target numerically: 'N' -> 0, 'Y' -> 1.
#
# Series.map produces a genuinely numeric column. The previous .loc assignments
# wrote 0/1 into an object-dtype column, which stays dtype=object and is rejected
# by scikit-learn classifiers as an unknown label type.
#
# The identity entries (0 -> 0, 1 -> 1) keep the cell idempotent when re-run.
# Rows without a label (the test part of all_data) remain NaN.
target_codes = {'N': 0, 'Y': 1, 0: 0, 1: 1}
train['delayed'] = train['delayed'].map(target_codes)
all_data['delayed'] = all_data['delayed'].map(target_codes)

In [None]:
def _calendar_code_to_int(column):
    """Return the part after the last '-' of each value, as integers.

    Values are stringified first and the *last* piece is taken, so the
    conversion also works on already-converted values and the cell can be
    re-run safely (the previous `.str[1]` turned every value into NaN on a
    second run).
    """
    return column.astype(str).str.split('-').str[-1].astype(int)


# Convert the calendar columns in BOTH frames to integers. Previously `train`
# kept them as strings, so the exploratory plots below (which pass
# order=range(...) with integer categories) matched nothing.
calendar_cols = ['DayofMonth', 'Month', 'DayOfWeek']
for frame in (train, all_data):
    for col in calendar_cols:
        frame[col] = _calendar_code_to_int(frame[col])

In [None]:
# Display the combined frame after the prefix stripping above.
all_data

In [None]:
# Count labelled rows per class (unlabelled rows match neither condition).
is_delayed = all_data['delayed'] == 1
is_on_time = all_data['delayed'] == 0
delayed_count_1 = is_delayed.sum()
delayed_count_0 = is_on_time.sum()

print(f"The count of '1's in 'delayed' column is: {delayed_count_1}")
print(f"The count of '0's in 'delayed' column is: {delayed_count_0}")

# Bar chart of the class balance.
status_labels = ['Not Delayed', 'Delayed']
status_counts = [delayed_count_0, delayed_count_1]
plt.bar(status_labels, status_counts, color=['blue', 'red'])
plt.title('Flight Delay Status Count')
plt.xlabel('Status')
plt.ylabel('Count')

plt.show()

In [None]:
# List the column names of the combined frame.
all_data.columns

In [None]:
# Flights per month: total count, count split by delay flag, and delay rate.
order = range(1, 13)
fig, (ax_count, ax_split) = plt.subplots(1, 2, figsize=(8, 2))

sns.countplot(data=train, x='Month', order=order, ax=ax_count)
ax_count.set_title('Nb of flights by month')

sns.countplot(data=train, x='Month', hue='delayed', order=order, ax=ax_split)
ax_split.set_title('Delayed/Not delayed flights by month')

# Separate figure for the mean of `delayed` per month (i.e. the delay rate).
plt.figure(figsize=(8, 2))
sns.barplot(data=train, x='Month', y='delayed', order=order, ax=plt.gca())

plt.show()

我们可以看到，所有月份的航班数量和延误数量几乎相同。不过，六月、七月和十二月的延迟率略高，可能是由于假期原因。

In [None]:
# Flights per day of month: total count, count split by delay flag, delay rate.
order = range(1, 32)
fig, (ax_count, ax_split, ax_rate) = plt.subplots(3, 1, figsize=(8, 8))

sns.countplot(data=train, x='DayofMonth', order=order, ax=ax_count)
ax_count.set_title('Nb of flights by day of month')

sns.countplot(data=train, x='DayofMonth', hue='delayed', order=order, ax=ax_split)
ax_split.set_title('Delayed/not Delayed flight by day of month')

sns.barplot(data=train, x='DayofMonth', y='delayed', order=order, ax=ax_rate)
ax_rate.set_title('Rate of delayed flights by day of month')

plt.tight_layout()
plt.show()

同样，很难说一个月中的各天之间是否存在很大差异。但是，我们可以说，在每月的最后几天，延误率较高。

In [None]:
# Flights per day of week: total count, count split by delay flag, delay rate.
order = range(1, 8)
fig, (ax_count, ax_split, ax_rate) = plt.subplots(1, 3, figsize=(11, 3))

sns.countplot(data=train, x='DayOfWeek', order=order, ax=ax_count)
ax_count.set_title('Nb of flights by day of week')

sns.countplot(data=train, x='DayOfWeek', hue='delayed', order=order, ax=ax_split)
ax_split.set_title('Delayed or not flights by day of week')

sns.barplot(data=train, x='DayOfWeek', y='delayed', order=order, ax=ax_rate)
ax_rate.set_title('Rate of delayed flights by day of week')

plt.tight_layout()
plt.show()

在这里我们可以看到，周四和周五的航班延误率最高，而周二、周三和周六的航班延误率最低。

In [None]:
# Histogram of raw departure times (default binning).
departure_times = train['DepTime']
plt.hist(departure_times)
plt.xlabel('Departure Time')

由于取值范围很大，等我们对这个变量进行分箱之后，再回来分析它。

In [None]:
# Flights per carrier: total count, count split by delay flag, delay rate.
fig, (ax_count, ax_split, ax_rate) = plt.subplots(3, 1, figsize=(8, 8))

sns.countplot(data=train, x='UniqueCarrier', ax=ax_count)
ax_count.set_title('Nb of flights per unique carrier')

sns.countplot(data=train, x='UniqueCarrier', hue='delayed', ax=ax_split)
ax_split.set_title('Nb of delayed/not flights by unique carrier')

sns.barplot(data=train, x='UniqueCarrier', y='delayed', ax=ax_rate)
ax_rate.set_title('Rate of delayed flights by unique carrier')

plt.tight_layout()
plt.show()

我们可以看到，UniqueCarrier 变量对延误有较好的区分作用。

In [None]:
# Use a finer binning (100 bins) to see the shape of the distance distribution.
flight_distances = train['Distance']
plt.hist(flight_distances, bins=100)
plt.xlabel('Distance')
plt.show()

我们可以看到，大多数航班的距离都很短，不到1000英里。对此变量进行标准化和/或缩放是个好主意吗？还是保留这种差异更有意义？也许可以对这个变量进行分箱（binning）。

In [None]:
# List the column names of the combined frame again before further plotting.
all_data.columns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay the departure-time distribution of delayed flights on that of all flights.
dep_time_all = all_data['DepTime']
dep_time_delayed = all_data.loc[all_data['delayed'] == 1, 'DepTime']

plt.hist(dep_time_all, bins=24, alpha=0.6, label='All Flights')
plt.hist(dep_time_delayed, bins=24, alpha=0.6, label='Delayed Flights')
plt.title('Departure Time Distribution')
plt.legend()
plt.show()

# Carrier vs. departure time, coloured by the delay flag.
plt.scatter(
    all_data['DepTime'],
    all_data['UniqueCarrier'],
    c=all_data['delayed'],
    cmap='coolwarm',
    alpha=0.5,
)
plt.colorbar(label='Delayed')
plt.title('Scatter Plot of Unique Carriers vs Departure Time with Delay Colormap')
plt.xlabel('Departure Time')
plt.ylabel('Unique Carrier')
plt.show()

In [None]:
# Cast the calendar columns of the combined frame to integers.
for calendar_col in ('Month', 'DayofMonth', 'DayOfWeek'):
    all_data[calendar_col] = all_data[calendar_col].astype(int)

# Make sure any boolean values are represented as 0 / 1.
all_data.replace(to_replace=[False, True], value=[0, 1], inplace=True)

In [None]:
# Display the combined frame after the integer casts.
all_data

In [None]:
# Route feature: origin and destination codes joined as "ORIGIN->DEST".
all_data['flight'] = all_data['Origin'].str.cat(all_data['Dest'], sep='->')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical columns as integer codes. One encoder object is
# re-fitted per column, so after the loop it holds the classes of 'flight' only.
label_encoder = LabelEncoder()
for categorical_col in ('UniqueCarrier', 'Origin', 'Dest', 'flight'):
    all_data[categorical_col] = label_encoder.fit_transform(all_data[categorical_col])

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between all columns of the combined frame.
correlation_matrix = all_data.corr()

# Visualise the correlation matrix as an annotated heatmap.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .5},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Display the combined frame after label encoding.
all_data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the two continuous features (zero mean, unit variance) in place.
scaler = StandardScaler()
scaled_columns = ['Distance', 'DepTime']
all_data[scaled_columns] = scaler.fit_transform(all_data[scaled_columns])

# Inspect the standardised data.
print(all_data.head())


In [None]:
# Use the SimHei font so the Chinese labels render, and keep the minus sign
# as a plain ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [None]:
from scipy import stats
from scipy.stats import norm, skew 
plt.style.use('fivethirtyeight')    

def draw_dist_prob(data, columns=('Distance', 'DepTime')):
    """Plot distribution and normal probability plots for numeric columns.

    For each column the top row shows its distribution with a fitted normal
    curve (legend gives the fitted mean and standard deviation), and the
    bottom row shows the corresponding probability (Q-Q) plot.

    Parameters
    ----------
    data : DataFrame-like
        Object indexable by column name; must contain every name in `columns`.
    columns : sequence of str, optional
        Columns to plot, one figure column each. Defaults to the two
        continuous features used in this notebook.
    """
    # squeeze=False keeps `ax` two-dimensional even when a single column is requested.
    fig, ax = plt.subplots(nrows=2, ncols=len(columns), figsize=(24, 12), dpi=300,
                           squeeze=False)

    for i, j in enumerate(columns):
        # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
        # here because the fitted-normal overlay (fit=norm) is what this plot shows.
        sns.distplot(data[j], fit=norm, ax=ax[0][i])
        (mu, sigma) = norm.fit(data[j])
        # Raw string: '\m' and '\s' are not valid Python escapes and trigger a
        # warning in a normal string literal; the resulting text is unchanged.
        ax[0][i].legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
        ax[0][i].set_ylabel('数量')
        ax[0][i].set_title('{} 频数图'.format(j))

        stats.probplot(data[j], plot=ax[1][i])

# Draw the distribution and probability plots for the combined frame.
draw_dist_prob(all_data)

In [None]:
# Split the combined frame back into its labelled (train) and unlabelled (test)
# parts. The boundary is taken from the size of `train` instead of a hard-coded
# 100000, so the split stays correct if the input files change. all_data was
# built as concat([train, test]) with a fresh index, so the first len(train)
# rows are exactly the training rows.
n_train_rows = len(train)
new_train = all_data.iloc[:n_train_rows].copy()
new_test = all_data.iloc[n_train_rows:].copy()

In [None]:
# Skewness and kurtosis per numeric feature.
# The two statistics are aligned by column label instead of being zipped
# positionally with new_train.columns: a positional zip silently attaches values
# to the wrong feature whenever a non-numeric column is left out of the result.
feature_skew = new_train.skew(numeric_only=True)
feature_kurt = new_train.kurt(numeric_only=True)
(
    pd.DataFrame({'偏度': feature_skew, '峰度': feature_kurt})
    .rename_axis('特征')
    .reset_index()
)

In [None]:
# Print the column names of the labelled part.
print(new_train.columns)

In [None]:
# Display the labelled part that is used for modelling.
new_train

In [None]:
# Features: every column except the target.
X = new_train.drop(columns=['delayed'])
# Target: cast explicitly to int so the estimators receive a clean binary label
# (an object-dtype column of 0/1 values is rejected by scikit-learn).
y = new_train['delayed'].astype(int)

# Hold out 20% for evaluation. The classes are imbalanced, so stratify on y to
# keep the same delayed/not-delayed ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=48, stratify=y
)

print("训练集样本数：", len(X_train))
print("测试集样本数：", len(X_test))

In [None]:
import os

# Directory containing the Graphviz executables on this machine
# (machine-specific; adjust to your own installation).
graphviz_path = r'C:/_program/Graphviz2.38/bin'

# Append the directory to PATH only if it is not already listed, so re-running
# this cell does not make PATH grow with duplicate entries.
current_path = os.environ.get('PATH', '')
if graphviz_path not in current_path.split(os.pathsep):
    os.environ['PATH'] = (
        current_path + os.pathsep + graphviz_path if current_path else graphviz_path
    )

## 逻辑回归

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score, roc_auc_score
from imblearn.pipeline import Pipeline


# Pipeline: SMOTE oversampling of the minority class, then logistic regression.
# (imblearn's Pipeline applies the sampler during fit only, not at predict time.)
pipeline = Pipeline([
    ('smote', SMOTE(random_state=48)),
    ('logistic_regression', LogisticRegression(max_iter=1000, random_state=48))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Probability of the positive class; ROC AUC needs scores, not hard 0/1 labels.
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Fixed: this line used to print the prediction array under the label "Accuracy".
print("Accuracy:", accuracy_score(y_test, y_pred))
score = pipeline.score(X_test, y_test)
print("Score:", score)
recall = recall_score(y_test, y_pred, average='macro')
print('Recall:', recall)
# Fixed: AUC is computed from probabilities instead of thresholded predictions.
print("roc_auc_score:", roc_auc_score(y_test, y_proba))

# Per-class report and confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Parameter grid for the logistic-regression step of the pipeline.
# The solver is pinned to 'liblinear' because it supports both the 'l1' and 'l2'
# penalties; the default 'lbfgs' solver does not support 'l1', so half of the
# original grid failed to fit.
param_grid = {
    'logistic_regression__C': [0.1, 1, 10],
    'logistic_regression__penalty': ['l1', 'l2'],
    'logistic_regression__solver': ['liblinear'],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='recall_macro')
grid_search.fit(X_train, y_train)

# Best hyper-parameters and their cross-validated macro recall.
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

y_pred = grid_search.predict(X_test)
# Probability of the positive class; ROC AUC needs scores, not hard 0/1 labels.
y_proba = grid_search.predict_proba(X_test)[:, 1]

# Fixed: this line used to print the prediction array under the label "Accuracy".
print("Accuracy:", accuracy_score(y_test, y_pred))
# grid_search.score uses the search's scoring metric (macro recall here).
score = grid_search.score(X_test, y_test)
print("Score:", score)
recall = recall_score(y_test, y_pred, average='macro')
print('Recall:', recall)
# Fixed: AUC is computed from probabilities instead of thresholded predictions.
print("roc_auc_score:", roc_auc_score(y_test, y_proba))

# Per-class report and confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Keep the tuned pipeline and look at the fitted logistic-regression step.
best_model = grid_search.best_estimator_
fitted_lr = best_model.named_steps['logistic_regression']

intercept_ = fitted_lr.intercept_
coef_ = fitted_lr.coef_

print("Intercept:", intercept_)
print("Coefficients:\n", coef_)

In [None]:
import os

import joblib

# Create the output directory first; joblib.dump does not create missing folders.
os.makedirs('model', exist_ok=True)
joblib.dump(best_model,'model/logistic_regression.pkl')

## xgboost

In [None]:
from xgboost import XGBClassifier


# Weight for the positive (delayed) class: number of negatives / number of positives.
# The ratio was previously inverted (positives / negatives), which down-weights
# the minority class instead of compensating for the imbalance.
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# XGBoost classifier with L1 (reg_alpha) and L2 (reg_lambda) regularisation
# and the class-imbalance weight computed above.
XGBR_classifier = XGBClassifier(random_state=48, reg_alpha=0.2, reg_lambda=100.0,
                                scale_pos_weight=pos_weight)

# Train the model.
XGBR_classifier.fit(X_train, y_train)

In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt

# Hard 0/1 predictions of the XGBoost model on the hold-out set.
y_pred = XGBR_classifier.predict(X_test)
# Feature importance measured by gain.
xgb.plot_importance(XGBR_classifier, importance_type='gain')
plt.show()

In [None]:
import os

# Make sure the output directory exists before saving.
os.makedirs("model", exist_ok=True)

# Draw tree number 48 on an explicit axes. Without ax=..., plot_tree creates its
# own new figure, so the 20x10 figure opened here was never the one being saved.
tree_fig, tree_ax = plt.subplots(figsize=(20, 10))
xgb.plot_tree(XGBR_classifier, num_trees=48, ax=tree_ax)
# Save at 300 DPI, as the original comment intended (the code said 3000).
tree_fig.savefig("model/xgb.png", dpi=300)
plt.close(tree_fig)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score
import matplotlib

In [None]:
# Evaluate the XGBoost model trained above (no refit needed: it is already
# fitted on the same data with the same seed).
xgb_y_pred = XGBR_classifier.predict(X_test)
# Probability of the positive class, used for ROC AUC.
xgb_y_proba = XGBR_classifier.predict_proba(X_test)[:, 1]

print('xgboost混淆矩阵:',confusion_matrix(y_test,xgb_y_pred))
print('xgboostf1得分:',f1_score(y_test,xgb_y_pred))
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
print("xgboost准确率：", xgb_accuracy)
xgb_recall = recall_score(y_test, xgb_y_pred, average='macro')
print("xgboost 召回率：", xgb_recall)
# Fixed: AUC used the stale hard-label `y_pred` from an earlier cell; it is now
# computed from this model's predicted probabilities.
print("xgboost auc",roc_auc_score(y_test, xgb_y_proba))

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for XGBoost.
param_grid = {
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1.0, 1.5],
    'scale_pos_weight': [1, 2, 5]  # tune according to the negative/positive ratio
}

# Base estimator for the search. It is deliberately NOT named `xgb`: that name is
# the alias of the xgboost module (`import xgboost as xgb`) used by the plotting
# cells, and rebinding it breaks them when they are re-run.
xgb_estimator = XGBClassifier(random_state=48)

grid_search = GridSearchCV(estimator=xgb_estimator, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Best hyper-parameters found by the search.
print("Best parameters found: ", grid_search.best_params_)

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the full training set, so reuse that model instead of
# training an identical one again.
best_xgb = grid_search.best_estimator_

In [None]:
# Evaluate the tuned XGBoost model on the hold-out set.
xgb_y_pred = best_xgb.predict(X_test)
# Probability of the positive class, used for ROC AUC.
xgb_y_proba = best_xgb.predict_proba(X_test)[:, 1]

print('xgboost混淆矩阵:',confusion_matrix(y_test,xgb_y_pred))
print('xgboostf1得分:',f1_score(y_test,xgb_y_pred))
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
print("xgboost准确率：", xgb_accuracy)
xgb_recall = recall_score(y_test, xgb_y_pred, average='macro')
print("xgboost 召回率：", xgb_recall)
# Fixed: AUC was computed from `y_pred`, which holds the predictions of a
# different (earlier) model; it now uses best_xgb's own probabilities.
print("xgboost auc",roc_auc_score(y_test, xgb_y_proba))

## 决策树

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, roc_auc_score


# Fully grown decision tree (entropy criterion). Class imbalance is handled by
# class_weight='balanced'; the hand-computed `class_weights` dict that used to
# be defined here was never passed to the model and has been removed.
dtree = DecisionTreeClassifier(
    random_state=48,
    class_weight='balanced',
    min_samples_leaf=1,
    min_samples_split=2,
    criterion='entropy',
)

# Train the model.
dtree.fit(X_train, y_train)

# Predict on the hold-out set.
dt_y_pred = dtree.predict(X_test)

# Print evaluation metrics.
print('决策树混淆矩阵:', confusion_matrix(y_test, dt_y_pred))
print('决策树f1得分:', f1_score(y_test, dt_y_pred, average='macro'))
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("决策树准确率：", dt_accuracy)
dt_recall = recall_score(y_test, dt_y_pred, average='macro')
print("决策树召回率：", dt_recall)

# ROC AUC from the predicted probability of the positive class.
dt_y_pred_proba = dtree.predict_proba(X_test)[:, 1]
print("决策树 auc:", roc_auc_score(y_test, dt_y_pred_proba))

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter search space for the decision tree.
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Base estimator; GridSearchCV fits clones of it, so `dtree` itself stays unfitted.
# (The unused, hand-computed `class_weights` dict was removed; imbalance is
# handled by class_weight='balanced'.)
dtree = DecisionTreeClassifier(random_state=48, class_weight='balanced')
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5, scoring='f1_macro')

grid_search.fit(X_train, y_train)
print("最佳参数:", grid_search.best_params_)

# Fixed: the fitted attribute is `best_estimator_` (trailing underscore);
# `best_estimator` raised AttributeError.
best_dtree = grid_search.best_estimator_
# cross_val_score fits clones, so best_dtree keeps its fit on the full training set.
cv_scores = cross_val_score(best_dtree, X_train, y_train, cv=5, scoring='f1_macro')
print("交叉验证的F1分数:", cv_scores)

# Predict and evaluate on the hold-out set.
dt_y_pred = best_dtree.predict(X_test)
print('决策树混淆矩阵:', confusion_matrix(y_test, dt_y_pred))
print('决策树f1得分:', f1_score(y_test, dt_y_pred, average='macro'))
print("决策树准确率：", accuracy_score(y_test, dt_y_pred))
print("决策树召回率：", recall_score(y_test, dt_y_pred, average='macro'))

# ROC AUC from the predicted probability of the positive class.
dt_y_pred_proba = best_dtree.predict_proba(X_test)[:, 1]
print("决策树 auc:", roc_auc_score(y_test, dt_y_pred_proba))

In [None]:
from sklearn import tree
import graphviz

import subprocess

# Export the tuned, fitted tree. The name `dtree` cannot be used here: it was
# rebound to a fresh estimator for the grid search, and GridSearchCV only fits
# clones, so `dtree` is unfitted at this point.
fitted_tree = grid_search.best_estimator_

# Export the decision tree in Graphviz dot format.
dot_data = tree.export_graphviz(fitted_tree,
                                feature_names=X_train.columns.tolist(),
                                class_names=np.unique(y_train).astype(str).tolist(),
                                filled=True, rounded=True,
                                special_characters=True)

# Write the dot source to a file.
with open("tree.dot", "w") as f:
    f.write(dot_data)

# Render the dot file to a PNG with the Graphviz `dot` command-line tool
# (here at DPI 1300). Arguments are passed as a list (no shell involved) and a
# failure is reported instead of being silently ignored.
dot_command = ['dot', '-Tpng', '-o', 'output_highres.png', 'tree.dot', '-Gdpi=1300']
try:
    render_result = subprocess.run(dot_command, check=False)
except FileNotFoundError:
    print("Graphviz 'dot' executable not found on PATH; tree.dot was written but not rendered.")
else:
    if render_result.returncode != 0:
        print("Graphviz 'dot' exited with status", render_result.returncode)

## 应用

In [None]:
# Load the sample submission file to see the expected output format.
sample = pd.read_csv("flight-delays-fall-2018/sample_submission.csv.zip", compression='zip')
# Show up to the first 900 rows.
sample.head(900)

In [None]:
# Drop the (empty) target column from the unlabelled part. It was renamed from
# 'dep_delayed_15min' to 'delayed' earlier in the notebook, so dropping the old
# name raised a KeyError.
new_test1 = new_test.drop(columns=['delayed'])

# Predicted probability of a delay from the tuned logistic-regression pipeline.
predictions = best_model.predict_proba(new_test1)[:, 1]

# Build the submission frame; ids follow the number of predictions instead of a
# hard-coded 100000.
submission = pd.DataFrame({'id': range(len(predictions)), 'dep_delayed_15min': predictions})
submission.head(900)

In [None]:
# Write the submission to CSV (without the index column) and confirm the file name.
filename = 'flight_delay.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')