In [123]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

In [100]:
# Flight-delay competition data, read directly from the course repository.
TRAIN_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/flight_delays_train.csv"
TEST_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/flight_delays_test.csv"

train = pd.read_csv(TRAIN_URL)
test = pd.read_csv(TEST_URL)

In [7]:
# Preview the first rows of the training frame (features plus the dep_delayed_15min target).
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [8]:
# Preview the first rows of the test frame; it has the same feature columns but no target.
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [9]:
# Baseline design matrix: only the two numeric columns, as plain NumPy arrays.
feature_cols = ["Distance", "DepTime"]
X_train = train[feature_cols].values
X_test = test[feature_cols].values

# Target: "Y" (delayed by 15+ minutes) -> 1, "N" -> 0.
y_train = train["dep_delayed_15min"].map({"Y": 1, "N": 0}).values

# Hold out 30% of the training rows for validation; the seed is fixed for reproducibility.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=17,
)

In [10]:
# Baseline model: default logistic regression on the two numeric features.
logit = LogisticRegression(random_state=17)
logit.fit(X_train_part, y_train_part)

# Column 1 of predict_proba holds the probability of class 1 (delayed).
valid_proba = logit.predict_proba(X_valid)
logit_valid_pred = valid_proba[:, 1]

# ROC AUC on the hold-out part is the cell's displayed result.
roc_auc_score(y_valid, logit_valid_pred)

0.6795697123357751

In [11]:
# Refit the baseline on the full training set, then score the test set.
logit.fit(X_train, y_train)
logit_test_pred = logit.predict_proba(X_test)[:, 1]

# Write the submission file: one probability per test row, with the row index labelled "id".
submission = pd.Series(logit_test_pred, name="dep_delayed_15min")
submission.to_csv("logit_2feat.csv", index_label="id", header=True)

# Benchmark

In [101]:
# Features are every column except the target; the target is encoded "Y" -> 1, "N" -> 0.
X_train = train.drop('dep_delayed_15min', axis=1)
y_train = train.dep_delayed_15min.map({"Y": 1, "N": 0})

# Work on a copy of the test frame. Binding X_test directly to `test` would make
# the in-place feature engineering that follows (adding Route, dropping
# Origin/Dest) mutate the raw frame, so re-running those cells would fail on the
# already-modified data.
X_test = test.copy()

In [102]:
# Combine the origin and destination airport codes into a single categorical
# "Route" feature (e.g. "ATL-DFW") on both the train and test frames.
for frame in (X_train, X_test):
    frame['Route'] = frame["Origin"] + '-' + frame["Dest"]

In [103]:
# Check that the new Route column is present on the training features.
X_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,Route
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,ATL-DFW
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,PIT-MCO
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,RDU-CLE
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,DEN-MEM
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,MDW-OMA


In [104]:
# Route already encodes the airport pair, so remove the separate Origin/Dest
# columns from both frames (modified in place).
for frame in (X_train, X_test):
    frame.drop(columns=['Origin', 'Dest'], inplace=True)

In [105]:
# Inspect column dtypes: the object-typed columns are the ones that get one-hot encoded next.
X_train.dtypes

Month            object
DayofMonth       object
DayOfWeek        object
DepTime           int64
UniqueCarrier    object
Distance          int64
Route            object
dtype: object

In [106]:
# One-hot encode every object-typed (categorical) column; numeric columns pass through unchanged.
# Train and test are encoded independently, so their dummy columns can differ.
train_cat_cols = X_train.columns[X_train.dtypes == "object"]
test_cat_cols = X_test.columns[X_test.dtypes == "object"]

X_train_dummies = pd.get_dummies(X_train, columns=train_cat_cols)
X_test_dummies = pd.get_dummies(X_test, columns=test_cat_cols)

In [107]:
# Preview the one-hot encoded training frame.
X_train_dummies.head()

Unnamed: 0,DepTime,Distance,Month_c-1,Month_c-10,Month_c-11,Month_c-12,Month_c-2,Month_c-3,Month_c-4,Month_c-5,...,Route_XNA-IAH,Route_XNA-LAX,Route_XNA-LGA,Route_XNA-ORD,Route_XNA-SLC,Route_YAK-CDV,Route_YAK-JNU,Route_YUM-IPL,Route_YUM-LAX,Route_YUM-PHX
0,1934,732,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1548,834,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1422,416,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1015,872,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1828,423,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
# Preview the one-hot encoded test frame; its dummy columns are not identical to the training ones.
X_test_dummies.head()

Unnamed: 0,DepTime,Distance,Month_c-1,Month_c-10,Month_c-11,Month_c-12,Month_c-2,Month_c-3,Month_c-4,Month_c-5,...,Route_XNA-MSP,Route_XNA-ORD,Route_XNA-SLC,Route_YAK-CDV,Route_YAK-JNU,Route_YKM-SLC,Route_YUM-IPL,Route_YUM-LAS,Route_YUM-LAX,Route_YUM-PHX
0,615,598,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,739,1235,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,651,577,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1614,377,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1505,258,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
# Train and test were one-hot encoded independently, so each can contain dummy
# columns (e.g. rare routes) that the other lacks. Keep only the columns present
# in BOTH frames, and select them in the same order (the training order) on each
# side, so the two frames share one feature space column-for-column.
in_both = X_train_dummies.columns.isin(X_test_dummies.columns)
shared_cols = X_train_dummies.columns[in_both]

# Re-select instead of dropping in place: this keeps the cell idempotent and
# forces the test frame into exactly the same column order as the training frame.
X_train_dummies = X_train_dummies[shared_cols]
X_test_dummies = X_test_dummies[shared_cols]

assert list(X_train_dummies.columns) == list(X_test_dummies.columns), "train/test columns differ"

In [112]:
# Remember the column labels of the aligned training frame so they can be re-attached later.
allColumns = X_train_dummies.columns

In [113]:
# Standardise every feature to zero mean / unit variance.
# The scaling statistics must be learned from the TRAINING data only and then
# applied unchanged to the test data. Fitting on the test set would leak test
# information into the model inputs and scale the training data with statistics
# it was not drawn from.
sc = StandardScaler()
X_train_dummies = sc.fit_transform(X_train_dummies)
X_test_dummies = sc.transform(X_test_dummies)

In [115]:
# Wrap the scaled values back into DataFrames, re-attaching the saved column labels.
X_test_dummies = pd.DataFrame(data=X_test_dummies, columns=allColumns)
X_train_dummies = pd.DataFrame(data=X_train_dummies, columns=allColumns)

In [126]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction (here 90%) of the variance.
pca = PCA(n_components=0.9)
# NOTE(review): in the recorded run this call raised MemoryError while allocating
# a float64 array of shape (100000, 4121) (~3.07 GiB), so `pca` is left unfitted.
pca.fit(X_train_dummies)

MemoryError: Unable to allocate 3.07 GiB for an array with shape (100000, 4121) and data type float64

In [116]:
# Same 70/30 hold-out split as the baseline (same seed), now on the scaled one-hot features.
split_parts = train_test_split(
    X_train_dummies,
    y_train,
    test_size=0.3,
    random_state=17,
)
X_train_part, X_valid, y_train_part, y_valid = split_parts

In [48]:
# Logistic regression with stronger-than-default regularisation (C=0.01), fitted with the SAGA solver.
logit = LogisticRegression(
    C=0.01,
    solver='saga',
    random_state=17,
)

In [122]:
# NOTE(review): the recorded run of this cell failed with MemoryError (2.15 GiB
# requested for a float64 array of shape (70000, 4121)), so the model was not fitted.
logit.fit(X_train_part, y_train_part)

MemoryError: Unable to allocate 2.15 GiB for an array with shape (70000, 4121) and data type float64