In [169]:
import pandas as pd
import numpy as np # Linear Algebra
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score,cross_val_predict # Cross Validation
from sklearn.preprocessing import OneHotEncoder # Perform OneHotEnconding
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt # Data Viz
import lightgbm as lgb

In [150]:
dataset = pd.read_csv("/Users/yanli/Downloads/Jan_2019_ontime.csv")

In [151]:
data = dataset.drop(['OP_UNIQUE_CARRIER','OP_CARRIER_AIRLINE_ID','OP_CARRIER','TAIL_NUM', 'ORIGIN_AIRPORT_ID','ORIGIN_AIRPORT_SEQ_ID','DEST_AIRPORT_ID','DEST_AIRPORT_SEQ_ID','Unnamed: 21'], axis=1)
#data = data.set_index('OP_CARRIER_FL_NUM')
data.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,2,3280,GNV,ATL,601.0,0.0,0600-0659,722.0,0.0,0.0,0.0,300.0
1,1,2,3281,MSP,CVG,1359.0,0.0,1400-1459,1633.0,0.0,0.0,0.0,596.0
2,1,2,3282,DTW,CVG,1215.0,0.0,1200-1259,1329.0,0.0,0.0,0.0,229.0
3,1,2,3283,TLH,ATL,1521.0,0.0,1500-1559,1625.0,0.0,0.0,0.0,223.0
4,1,2,3284,ATL,FSM,1847.0,0.0,1900-1959,1940.0,0.0,0.0,0.0,579.0


In [154]:

pd.DataFrame({'unicos':data.nunique(),
              'missing': data.isna().sum()/data.count(),
              'tipo':data.dtypes})

Unnamed: 0,unicos,missing,tipo
DAY_OF_MONTH,31,0.0,int64
DAY_OF_WEEK,7,0.0,int64
OP_CARRIER_FL_NUM,6839,0.0,int64
ORIGIN,346,0.0,object
DEST,346,0.0,object
DEP_TIME,1439,0.0,float64
DEP_DEL15,2,0.0,float64
DEP_TIME_BLK,19,0.0,object
ARR_TIME,1441,0.0,float64
ARR_DEL15,2,0.0,float64


In [153]:
data.ARR_TIME.fillna(0, inplace=True)
data.DEP_TIME.fillna(0, inplace=True)
data.DEP_DEL15.fillna(1, inplace=True)
data.ARR_DEL15.fillna(1, inplace=True)

In [177]:
t_data = data.copy()
#Transformation of data types
colunas = ['DAY_OF_WEEK','DAY_OF_MONTH','ORIGIN', 'DEST', 'DEP_TIME_BLK', 'DEP_DEL15','ARR_DEL15','CANCELLED','DIVERTED']
for col in colunas:
  t_data[col] = t_data[col].astype('category') 
X = t_data.drop(columns=['ARR_TIME'])
y = t_data[["ARR_TIME"]]

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [157]:
X_train.head(5)

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
68992,4,5,1854,ORD,PHL,654.0,0.0,0700-0759,0.0,0.0,0.0,678.0
256311,14,1,5366,ONT,SFO,1917.0,0.0,1900-1959,0.0,0.0,0.0,363.0
483968,26,6,3666,JFK,CLE,1541.0,0.0,1500-1559,0.0,0.0,0.0,425.0
67415,4,5,1616,BUF,BOS,2101.0,0.0,2100-2159,0.0,0.0,0.0,395.0
93603,5,6,5885,PHX,FAT,1752.0,0.0,1700-1759,1.0,0.0,0.0,493.0


In [166]:
model = lgb.LGBMRegressor(num_leaves=63,
                         learning_rate=0.1,
                         n_estimators=40)


In [167]:
model.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(5)])



Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[40]	valid_0's l1: 67.2327	valid_0's l2: 32073.4


LGBMRegressor(n_estimators=40, num_leaves=63)

In [168]:
print('Starting predicting...')
# predict
y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

# feature importances
print(f'Feature importances: {list(model.feature_importances_)}')

Starting predicting...
The RMSE of prediction is: 179.0903722825794
Feature importances: [31, 0, 13, 457, 548, 633, 23, 57, 73, 81, 49, 515]


In [165]:
# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)

print(f'Best parameters found by grid search are: {gbm.best_params_}')

Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 40}


In [146]:
cat_vars_final = t_data.select_dtypes(['object','category'])
cat_vars_final.head(5)

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,ORIGIN,DEST,DEP_DEL15,DEP_TIME_BLK,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE_cat
0,1,2,GNV,ATL,0.0,0600-0659,0.0,0.0,0.0,"(30.999, 363.0]"
1,1,2,MSP,CVG,0.0,1400-1459,0.0,0.0,0.0,"(363.0, 640.0]"
2,1,2,DTW,CVG,0.0,1200-1259,0.0,0.0,0.0,"(30.999, 363.0]"
3,1,2,TLH,ATL,0.0,1500-1559,0.0,0.0,0.0,"(30.999, 363.0]"
4,1,2,ATL,FSM,0.0,1900-1959,0.0,0.0,0.0,"(363.0, 640.0]"


In [147]:
#One Hot Encoder

enc = OneHotEncoder().fit(cat_vars_final)

cat_vars_ohe_final = enc.transform(cat_vars_final).toarray()
cat_vars_ohe_final = pd.DataFrame(cat_vars_ohe_final, index= cat_vars_final.index, 
                      columns=enc.get_feature_names(cat_vars_final.columns.tolist()))

In [178]:
#Instantizing Model
lr_model_final = LinearRegression.fit(X, y)

#training
lr_model_final.score(X,y)

TypeError: fit() missing 1 required positional argument: 'y'

In [6]:
# Data wrangling 
import pandas as pd 

# Array math
import numpy as np 

# Quick value count calculator
from collections import Counter


class Node: 
    """
    Class for creating the nodes for a decision tree 
    """
    def __init__(
        self, 
        Y: list,
        X: pd.DataFrame,
        min_samples_split=None,
        max_depth=None,
        depth=None,
        node_type=None,
        rule=None
    ):
        # Saving the data to the node 
        self.Y = Y 
        self.X = X

        # Saving the hyper parameters
        self.min_samples_split = min_samples_split if min_samples_split else 20
        self.max_depth = max_depth if max_depth else 5

        # Default current depth of node 
        self.depth = depth if depth else 0

        # Extracting all the features
        self.features = list(self.X.columns)

        # Type of node 
        self.node_type = node_type if node_type else 'root'

        # Rule for spliting 
        self.rule = rule if rule else ""

        # Calculating the counts of Y in the node 
        self.counts = Counter(Y)

        # Getting the GINI impurity based on the Y distribution
        self.gini_impurity = self.get_GINI()

        # Sorting the counts and saving the final prediction of the node 
        counts_sorted = list(sorted(self.counts.items(), key=lambda item: item[1]))

        # Getting the last item
        yhat = None
        if len(counts_sorted) > 0:
            yhat = counts_sorted[-1][0]

        # Saving to object attribute. This node will predict the class with the most frequent class
        self.yhat = yhat 

        # Saving the number of observations in the node 
        self.n = len(Y)

        # Initiating the left and right nodes as empty nodes
        self.left = None 
        self.right = None 

        # Default values for splits
        self.best_feature = None 
        self.best_value = None 

    @staticmethod
    def GINI_impurity(y1_count: int, y2_count: int) -> float:
        """
        Given the observations of a binary class calculate the GINI impurity
        """
        # Ensuring the correct types
        if y1_count is None:
            y1_count = 0

        if y2_count is None:
            y2_count = 0

        # Getting the total observations
        n = y1_count + y2_count
        
        # If n is 0 then we return the lowest possible gini impurity
        if n == 0:
            return 0.0

        # Getting the probability to see each of the classes
        p1 = y1_count / n
        p2 = y2_count / n
        
        # Calculating GINI 
        gini = 1 - (p1 ** 2 + p2 ** 2)
        
        # Returning the gini impurity
        return gini

    @staticmethod
    def ma(x: np.array, window: int) -> np.array:
        """
        Calculates the moving average of the given list. 
        """
        return np.convolve(x, np.ones(window), 'valid') / window

    def get_GINI(self):
        """
        Function to calculate the GINI impurity of a node 
        """
        # Getting the 0 and 1 counts
        y1_count, y2_count = self.counts.get(0, 0), self.counts.get(1, 0)

        # Getting the GINI impurity
        return self.GINI_impurity(y1_count, y2_count)

    def best_split(self) -> tuple:
        """
        Given the X features and Y targets calculates the best split 
        for a decision tree
        """
        # Creating a dataset for spliting
        df = self.X.copy()
        df['Y'] = self.Y

        # Getting the GINI impurity for the base input 
        GINI_base = self.get_GINI()

        # Finding which split yields the best GINI gain 
        max_gain = 0

        # Default best feature and split
        best_feature = None
        best_value = None

        for feature in self.features:
            print(feature)
            # Droping missing values
            Xdf = df.dropna().sort_values(feature)

            # Sorting the values and getting the rolling average
            xmeans = self.ma(Xdf[feature].unique(), 2)

            for value in xmeans:
                # Spliting the dataset 
                left_counts = Counter(Xdf[Xdf[feature]<value]['Y'])
                right_counts = Counter(Xdf[Xdf[feature]>=value]['Y'])

                # Getting the Y distribution from the dicts
                y0_left, y1_left, y0_right, y1_right = left_counts.get(0, 0), left_counts.get(1, 0), right_counts.get(0, 0), right_counts.get(1, 0)

                # Getting the left and right gini impurities
                gini_left = self.GINI_impurity(y0_left, y1_left)
                gini_right = self.GINI_impurity(y0_right, y1_right)

                # Getting the obs count from the left and the right data splits
                n_left = y0_left + y1_left
                n_right = y0_right + y1_right

                # Calculating the weights for each of the nodes
                w_left = n_left / (n_left + n_right)
                w_right = n_right / (n_left + n_right)

                # Calculating the weighted GINI impurity
                wGINI = w_left * gini_left + w_right * gini_right

                # Calculating the GINI gain 
                GINIgain = GINI_base - wGINI

                # Checking if this is the best split so far 
                if GINIgain > max_gain:
                    best_feature = feature
                    best_value = value 

                    # Setting the best gain to the current one 
                    max_gain = GINIgain

        return (best_feature, best_value)

    def grow_tree(self):
        """
        Recursive method to create the decision tree
        """
        # Making a df from the data 
        df = self.X.copy()
        df['Y'] = self.Y

        # If there is GINI to be gained, we split further 
        if (self.depth < self.max_depth) and (self.n >= self.min_samples_split):

            # Getting the best split 
            best_feature, best_value = self.best_split()

            if best_feature is not None:
                # Saving the best split to the current node 
                self.best_feature = best_feature
                self.best_value = best_value

                # Getting the left and right nodes
                left_df, right_df = df[df[best_feature]<=best_value].copy(), df[df[best_feature]>best_value].copy()

                # Creating the left and right nodes
                left = Node(
                    left_df['Y'].values.tolist(), 
                    left_df[self.features], 
                    depth=self.depth + 1, 
                    max_depth=self.max_depth, 
                    min_samples_split=self.min_samples_split, 
                    node_type='left_node',
                    rule=f"{best_feature} <= {round(best_value, 3)}"
                    )

                self.left = left 
                self.left.grow_tree()

                right = Node(
                    right_df['Y'].values.tolist(), 
                    right_df[self.features], 
                    depth=self.depth + 1, 
                    max_depth=self.max_depth, 
                    min_samples_split=self.min_samples_split,
                    node_type='right_node',
                    rule=f"{best_feature} > {round(best_value, 3)}"
                    )

                self.right = right
                self.right.grow_tree()

   


In [17]:
arr = [[1,2,1],[0,1,0]]
a = np.array(arr)
f = a[:,-1]
b = np.delete(a, -1, axis=1)
df = pd.DataFrame(arr)
for col in df:
    df[col].replace(0, df[col].mean(),inplace=True)
print(df)
for col in df:
    df[col] = (df[col]-df[col].mean())/df[col].std()
print(df)

     0  1    2
0  1.0  2  1.0
1  0.5  1  0.5
          0         1         2
0  0.707107  0.707107  0.707107
1 -0.707107 -0.707107 -0.707107


In [None]:
#Step 1 define distance metric
def euc_dist(value1, value2):
    # implement this
    summ = 0
    for i in range(len(value1)):
        #print(value1,value2)
        summ += (value1[i]-value2[i])**2
    return summ**(0.5)
   
#Step 2 Identify k nearest neighbors
def k_neighbors(train_data, test_case, k):
    # implement this
    dis_tc = []
    for i, td in enumerate(train_data):
        dist_ = euc_dist(td[:-1],test_case)
        dis_tc.append([dist_, td])
    output_ = sorted(dis_tc)
    out=[]
    for j in range(k):
        out.append(output_[j][1])
    #print(output_,'***************',out)
    return out
#Step 3 Assign class labels
def get_label(train_data, test_case, k):
    neighbors = k_neighbors(train_data, test_case, k)
    labels = [row[-1] for row in neighbors]
    max_label = max(set(labels), key = labels.count)
   
    return max_label
#Pull it together
def solution(train_data, test_data, k):
    final_labels = list()
    for row in test_data:
        label = get_label(train_data, row, k)
        final_labels.append(label)
    return final_labels

In [None]:
import random
from scipy.spatial.distance import cdist
def calculate_distances(data, centroids):
    """
    Step 1: Calculate distance between each data point and the k centroids
    """
    return cdist(data, centroids, "cosine").tolist()
def make_clusters(distances):
    """
    Step 2: Assign each data point to it's nearest centroid
    """
    # implement this
    ...
def update_clusters(clusters, data, k, iterations):
    """
    Step 3: Average the data points in each cluster to update
    the centroids' locations and repeat for set number of iterations
    """
    # implement this
    ...
# pull everything together
def solution(d‍‍‌‍‍‍‍‍‍‌‍‌‍‍‌‍‌‌‌‍ata, k, centroids, iterations):
    distances = calculate_distances(data, centroids)
    clusters = make_clusters(distances)
    clusters = update_clusters(clusters, data, k, iterations)
    return clusters

In [None]:
class Solution:
    def minimizeResult(self, expression: str) -> str:
        first, second = expression.split("+")
        score = float('inf')
        for i in range(len(first)):
            num1 = int(first[:i]) if i != 0 else 1
            num2 = int(first[i:])
            for j in range(1, len(second)+1):
                num3 = int(second[:j])
                num4 = int(second[j:]) if j != len(second) else 1
                tem_score = num1*(num2+num3)*num4
                if tem_score < score:
                    score = tem_score
        return score