# GRADIENT BOOSTING REGRESSOR MODEL

# Modules used

In [177]:
import pandas as pd
import numpy as np

from haversine import haversine
from astral.sun import sun
from astral import LocationInfo
from sklearn.preprocessing import PolynomialFeatures
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

# Loading data

In [178]:
df = pd.read_csv("/Users/Tomasss/uni/23_24Master/Modelos toma de decisión/entrega 1/maas-demand-in-madrid/data/data_mtd.csv")

From the previous model we already know that there are no null values so we can go on with removing outliers and feature engineering

# Checking data

## Outliers

### Coordinates outside Madrid

In [179]:
df['distancia'] = df.apply(lambda row: haversine((row['lat_ini'], row['long_ini']),
                                                (row['lat_fin'], row['long_fin'])),axis=1)
# def clean_coordinates(df):
#    nrows = df.shape[0]
#    df.drop(df.index[
#        
#            ~((df['lat_ini'].between(40.496115395170364, 40.91553277700258)) &
#              (df['long_ini'].between(-74.25559136315209, -73.7000090639354))) 
#        
#    ], inplace=True)
#    print("Number of rows removed due to wrong coordinates is {}".format(nrows - df.shape[0]))
#    
#clean_coordinates(df)

### Trip time below a minute and over two hours

In [180]:
def clean_trip_duration(df):
    # convert from object to datetime
    df['inicio_viaje']  = pd.to_datetime(df['inicio_viaje'])
    df['final_viaje']  = pd.to_datetime(df['final_viaje'])
    
    # copute the time diffrance between pickup & dropoff
    # to covert from nanosecondes to minutes we devide by 1000000000 then by 60
    # store trip_duratin column
    trip_duration = np.array(df['final_viaje']-df['inicio_viaje'])
    trip_duration = trip_duration/1000000000/60
    df['trip_duration'] = trip_duration.astype(float)
    
    # drop all records that have trip_duration > 2 hours
    #                            trip_duration <= 0
    #                            trip_distance <= 0
    nrows = df.shape[0]
    df.drop(df[(df['trip_duration'] > 160) | 
               (df['trip_duration'] <= 0)].index, inplace = True)
    print("Number of rows removed due to wrong trip_duration {}".format(nrows - df.shape[0]))
    
    
clean_trip_duration(df)

Number of rows removed due to wrong trip_duration 383


### Null or excesive distance

In [181]:
def clean_trip_distance(df):
    nrows = df.shape[0]
    df.drop(df[(df['distancia'] <= 0) | (df['distancia'] > 77.5)].index, inplace = True)
    print("Number of rows removed due to speed outliers {}".format(nrows - df.shape[0]))
    
clean_trip_distance(df)

Number of rows removed due to speed outliers 722


### Null or excesive velocity

In [182]:
def compute_speed(df):
    # computing Taxi speed average (mile/hour)
    df['speed'] = df['distancia']/df['trip_duration']*60
    
def clean_speed(df):

    # Removing speed anomaly/outliers
    nrows = df.shape[0]
    df.drop(df[((df['speed'] <= 0) | (df['speed'] > 63.0))].index, inplace = True)
    print("Number of rows removed due to speed outliers {}".format(nrows - df.shape[0]))


compute_speed(df)
clean_speed(df)

Number of rows removed due to speed outliers 89


# Feature engineering (train and real prediction data)

In [183]:
df['date2'] = pd.to_datetime(df['inicio_viaje'])


df['date'] = df['date2'].dt.date
df['hour'] = pd.DatetimeIndex(df['date2']).hour.astype(int)
#num_viajes['hour'] = num_viajes['hour'].astype(int)
df['mes'] = df['date2'].dt.month

def get_season(mes):
    if 3 <= mes <= 5:
        return '1'  # Primavera
    elif 6 <= mes <= 8:
        return '2'  # Verano
    elif 9 <= mes <= 11:
        return '3'  # Otoño
    else:
        return '4'  # Invierno

df['season'] = df['mes'].apply(get_season)

In [184]:
num_viajes=df.groupby(['cluster', 'hour','date','mes'])['cluster'].count().to_frame(name = 'trips').reset_index()

In [185]:
#num_viajes['hour'] = num_viajes['hour'].astype(int)

##  Feature engineering with real prediction data

In [186]:
test = pd.read_csv("/Users/Tomasss/uni/23_24Master/Modelos toma de decisión/entrega 1/maas-demand-in-madrid/data/sample_submission.csv")

In [187]:
split_test= test["Id"].str.split("_", n = 3, expand = True)

In [188]:
test['cluster'] = split_test[0]
test['date'] = split_test[1]
test['hour'] = split_test[2]

test['date'] = pd.to_datetime(test['date'])

test['dia_semana'] = test['date'].dt.dayofweek
test['mes'] = test['date'].dt.month
test['season'] = test['mes'].apply(get_season)

test = test.drop(columns=['trips'])

Weather data (exogenus features)

In [189]:
weather = pd.read_csv('data/export_weather.csv')

In [190]:
horas = list(range(24))

weather_expandido = pd.DataFrame()
for _, fila in weather.iterrows():
    fila_repetida = pd.DataFrame([fila] * 24)
    fila_repetida['hour'] = horas
    weather_expandido = pd.concat([weather_expandido, fila_repetida], ignore_index=True)

In [191]:
new_row = weather_expandido.iloc[-1].copy()
new_row['date'] = '2023-01-01'
new_row_df = pd.DataFrame([new_row])

weather_expandido = pd.concat([weather_expandido, new_row_df],ignore_index = True)
weather_expandido['hour'].iloc[8760] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_expandido['hour'].iloc[8760] = 0


In [192]:
weather_expandido['date'] = pd.to_datetime(weather_expandido['date'])

In [193]:
weather_train = weather_expandido[weather_expandido['date'] <= '2022-09-30']
weather_test = weather_expandido[weather_expandido['date'] >= '2022-09-30']

In [194]:
num_viajes['date'] = pd.to_datetime(num_viajes['date'])

## Merging with weather data

In [195]:
num_viajes_weather = pd.merge(num_viajes,weather_train, on = ['date','hour'])
num_viajes_weather.sort_values(['date','hour'])

Unnamed: 0,cluster,hour,date,mes,trips,tavg,tmin,tmax,prcp,wdir,wspd,pres
362052,3,0,2022-01-01,1,1,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,-1.129823,1.649661
362053,5,0,2022-01-01,1,1,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,-1.129823,1.649661
362054,7,0,2022-01-01,1,1,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,-1.129823,1.649661
362055,8,0,2022-01-01,1,2,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,-1.129823,1.649661
362056,10,0,2022-01-01,1,1,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,-1.129823,1.649661
...,...,...,...,...,...,...,...,...,...,...,...,...
292460,95,21,2022-09-30,9,1,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,-0.126131,-0.466580
292461,97,21,2022-09-30,9,5,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,-0.126131,-0.466580
292462,98,21,2022-09-30,9,9,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,-0.126131,-0.466580
292463,99,21,2022-09-30,9,4,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,-0.126131,-0.466580


In [196]:
test['hour'] = test['hour'].astype(int)
test['date'] = pd.to_datetime(test['date'])

In [197]:
test = pd.merge(test,weather_test, on = ['date','hour'])

## More feature engineering

Bank holidays

In [198]:
madrid_holidays = ['2022-01-01', '2022-01-06', '2022-04-15', '2022-05-01', '2022-08-15', '2022-10-12', '2022-11-01', '2022-12-06', '2022-12-25']
num_viajes_weather['holiday'] = 0
num_viajes_weather.loc[num_viajes_weather['date'].isin(madrid_holidays), 'holiday'] = 1

Week of the year and day of the week

In [199]:
num_viajes_weather['date'] = pd.to_datetime(num_viajes_weather['date'])

In [200]:
num_viajes_weather['week_of_year'] = num_viajes_weather['date'].dt.isocalendar().week
num_viajes_weather['week_day'] = num_viajes_weather['date'].dt.day_of_week + 1

Sunlight data

In [201]:
# Sunlight features
# ==============================================================================
location = LocationInfo(
    name='Madrid',
    region='Spain',
    timezone='Europe/Madrid',
    latitude=40.4165000,
    longitude=-3.7025600)

sunrise_hour = [
    sun(location.observer, date=date, tzinfo=location.timezone)['sunrise'].hour
    for date in num_viajes_weather['date']]

sunset_hour = [
    sun(location.observer, date=date, tzinfo=location.timezone)['sunset'].hour
    for date in num_viajes_weather['date']]

sun_light_features = pd.DataFrame({
                         'sunrise_hour': sunrise_hour,
                         'sunset_hour': sunset_hour}, 
                         index = num_viajes_weather.index)

sun_light_features['daylight_hours'] = (
    sun_light_features['sunset_hour'] - sun_light_features['sunrise_hour'])

sun_light_features['is_daylight'] = np.where(
                                        (num_viajes_weather['hour'] >= sun_light_features['sunrise_hour']) & \
                                        (num_viajes_weather['hour'] < sun_light_features['sunset_hour']),
                                        1,
                                        0)

num_viajes_weather = pd.concat([
                            num_viajes_weather,
                            sun_light_features,
                        ], axis=1)

Cyclical values encoding

In [205]:
# Cliclical encoding of calendar and sunlight features
# ==============================================================================
def cyclical_encoding(data: pd.Series, cycle_length: int) -> pd.DataFrame:
    """
    Encode a cyclical feature with two new features sine and cosine.
    The minimum value of the feature is assumed to be 0. The maximum value
    of the feature is passed as an argument.
      
    Parameters
    ----------
    data : pd.Series
        Series with the feature to encode.
    cycle_length : int
        The length of the cycle. For example, 12 for months, 24 for hours, etc.
        This value is used to calculate the angle of the sin and cos.

    Returns
    -------
    result : pd.DataFrame
        Dataframe with the two new features sin and cos.

    """

    sin = np.sin(2 * np.pi * data/cycle_length)
    cos = np.cos(2 * np.pi * data/cycle_length)
    result =  pd.DataFrame({
                  f"{data.name}_sin": sin,
                  f"{data.name}_cos": cos
              })

    return result


month_encoded = cyclical_encoding(num_viajes_weather['mes'], cycle_length=12)
week_of_year_encoded = cyclical_encoding(num_viajes_weather['week_of_year'], cycle_length=52)
week_day_encoded = cyclical_encoding(num_viajes_weather['week_day'], cycle_length=7)
hour_day_encoded = cyclical_encoding(num_viajes_weather['hour'], cycle_length=24)
sunrise_hour_encoded = cyclical_encoding(num_viajes_weather['sunrise_hour'], cycle_length=24)
sunset_hour_encoded = cyclical_encoding(num_viajes_weather['sunset_hour'], cycle_length=24)

cyclical_features = pd.concat([
                        month_encoded,
                        week_of_year_encoded,
                        week_day_encoded,
                        sunrise_hour_encoded,
                        sunset_hour_encoded,
                        hour_day_encoded
                    ], axis=1)

num_viajes_weather = pd.concat([
                            num_viajes_weather,
                            cyclical_features,
                        ], axis=1)

Feature interaction

In [206]:
# Interaction between exogenous variables
# ==============================================================================
transformer_poly = PolynomialFeatures(
                       degree           = 2,
                       interaction_only = True,
                       include_bias     = False
                   ).set_output(transform="pandas")

poly_cols = ['cluster', 
             'hour', 
             'tmin', 
             'pres',
             'holiday',
             'sunrise_hour', 
             'sunset_hour',
             'mes_sin', 
             'mes_cos', 
             'week_of_year_sin',
             'week_of_year_cos', 
             'week_day_sin', 
             'week_day_cos', 
             'sunrise_hour_sin',
             'sunrise_hour_cos', 
             'sunset_hour_sin', 
             'sunset_hour_cos', 
             'hour_sin',
             'hour_cos']

 
poly_features = transformer_poly.fit_transform(num_viajes_weather[poly_cols].dropna())
poly_features = poly_features.drop(columns=poly_cols)
poly_features.columns = [f"poly_{col}" for col in poly_features.columns]
poly_features.columns = poly_features.columns.str.replace(" ", "__")
num_viajes_weather = pd.concat([num_viajes_weather, poly_features], axis=1)

The same for the real prediction data

In [207]:
madrid_holidays = ['2022-01-01', '2022-01-06', '2022-04-15', '2022-05-01', '2022-08-15', '2022-10-12', '2022-11-01', '2022-12-06', '2022-12-25']
test['holiday'] = 0
test.loc[test['date'].isin(madrid_holidays), 'holiday'] = 1

In [208]:
test['date'] = pd.to_datetime(test['date'])

In [209]:
test['week_of_year'] = test['date'].dt.isocalendar().week
test['week_day'] = test['date'].dt.day_of_week + 1

In [210]:
# Sunlight features
# ==============================================================================
location = LocationInfo(
    name='Madrid',
    region='Spain',
    timezone='Europe/Madrid',
    latitude=40.4165000,
    longitude=-3.7025600)

sunrise_hour = [
    sun(location.observer, date=date, tzinfo=location.timezone)['sunrise'].hour
    for date in test['date']]

sunset_hour = [
    sun(location.observer, date=date, tzinfo=location.timezone)['sunset'].hour
    for date in test['date']]

sun_light_features = pd.DataFrame({
                         'sunrise_hour': sunrise_hour,
                         'sunset_hour': sunset_hour}, 
                         index = test.index)

sun_light_features['daylight_hours'] = (
    sun_light_features['sunset_hour'] - sun_light_features['sunrise_hour'])

sun_light_features['is_daylight'] = np.where(
                                        (test['hour'] >= sun_light_features['sunrise_hour']) & \
                                        (test['hour'] < sun_light_features['sunset_hour']),
                                        1,
                                        0)

test = pd.concat([
                            test,
                            sun_light_features,
                        ], axis=1)

In [211]:
test['mes'] = test['mes'].astype(int)

In [212]:
# Cliclical encoding of calendar and sunlight features
# ==============================================================================
def cyclical_encoding(data: pd.Series, cycle_length: int) -> pd.DataFrame:
    """
    Encode a cyclical feature with two new features sine and cosine.
    The minimum value of the feature is assumed to be 0. The maximum value
    of the feature is passed as an argument.
      
    Parameters
    ----------
    data : pd.Series
        Series with the feature to encode.
    cycle_length : int
        The length of the cycle. For example, 12 for months, 24 for hours, etc.
        This value is used to calculate the angle of the sin and cos.

    Returns
    -------
    result : pd.DataFrame
        Dataframe with the two new features sin and cos.

    """

    sin = np.sin(2 * np.pi * data/cycle_length)
    cos = np.cos(2 * np.pi * data/cycle_length)
    result =  pd.DataFrame({
                  f"{data.name}_sin": sin,
                  f"{data.name}_cos": cos
              })

    return result


month_encoded = cyclical_encoding(test['mes'], cycle_length=12)
week_of_year_encoded = cyclical_encoding(test['week_of_year'], cycle_length=52)
week_day_encoded = cyclical_encoding(test['week_day'], cycle_length=7)
hour_day_encoded = cyclical_encoding(test['hour'], cycle_length=24)
sunrise_hour_encoded = cyclical_encoding(test['sunrise_hour'], cycle_length=24)
sunset_hour_encoded = cyclical_encoding(test['sunset_hour'], cycle_length=24)

cyclical_features = pd.concat([
                        month_encoded,
                        week_of_year_encoded,
                        week_day_encoded,
                        sunrise_hour_encoded,
                        sunset_hour_encoded,
                        hour_day_encoded
                    ], axis=1)

test = pd.concat([
                            test,
                            cyclical_features,
                        ], axis=1)

In [213]:
# Interaction between exogenous variables
# ==============================================================================
transformer_poly = PolynomialFeatures(
                       degree           = 2,
                       interaction_only = True,
                       include_bias     = False
                   ).set_output(transform="pandas")

poly_cols = ['cluster', 
             'hour',  
             'tmin', 
             'pres',
             'holiday',
             'sunrise_hour', 
             'sunset_hour',
             'mes_sin', 
             'mes_cos', 
             'week_of_year_sin',
             'week_of_year_cos', 
             'week_day_sin', 
             'week_day_cos', 
             'sunrise_hour_sin',
             'sunrise_hour_cos', 
             'sunset_hour_sin', 
             'sunset_hour_cos', 
             'hour_sin',
             'hour_cos']

 
poly_features = transformer_poly.fit_transform(test[poly_cols].dropna())
poly_features = poly_features.drop(columns=poly_cols)
poly_features.columns = [f"poly_{col}" for col in poly_features.columns]
poly_features.columns = poly_features.columns.str.replace(" ", "__")
test = pd.concat([test, poly_features], axis=1)

In [214]:
num_viajes_weather['hour'] = num_viajes_weather['hour'].astype(str)
num_viajes_weather['week_day'] = num_viajes_weather['week_day'].astype(str)
num_viajes_weather['week_of_year'] = num_viajes_weather['week_of_year'].astype(str)
num_viajes_weather['mes'] = num_viajes_weather['mes'].astype(str)

# Splitting train and test

Turning everything to string type for better predictions

In [215]:
num_viajes_features = num_viajes_weather[['cluster', 
                                          'hour',
                                          'week_day'
                                          ]]
num_viajes_labels = num_viajes_weather['trips']

In [216]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(num_viajes_features, num_viajes_labels, test_size = 0.15, random_state = 42)

In [217]:
train_features

Unnamed: 0,cluster,hour,week_day
103470,37,12,6
11928,86,1,6
318923,43,23,7
26903,73,6,2
7023,3,0,5
...,...,...,...
259178,45,20,7
365838,22,2,1
131932,61,13,5
146867,79,14,7


# Model

In [218]:
model = GradientBoostingRegressor(loss="absolute_error", n_estimators=100, learning_rate=0.25, max_depth=10,random_state=42)

model.fit(train_features, train_labels)

In [219]:
y_pred = model.predict(test_features)

In [220]:
# Calculate the absolute errors
errors = abs(y_pred - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', (np.mean(errors)))

Mean Absolute Error: 1.2495559316611264


# Real prediction

In [221]:
num_viajes_weather.sort_values('date')

Unnamed: 0,cluster,hour,date,mes,trips,tavg,tmin,tmax,prcp,wdir,...,poly_sunset_hour_cos__hour_sin,poly_sunset_hour_cos__hour_sin.1,poly_sunset_hour_cos__hour_cos,poly_sunset_hour_cos__hour_cos.1,poly_hour_sin__hour_sin,poly_hour_sin__hour_cos,poly_hour_sin__hour_cos.1,poly_hour_sin__hour_cos.2,poly_hour_sin__hour_cos.3,poly_hour_cos__hour_cos
378838,61,16,2022-01-01,1,3,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,...,2.241439e-01,2.241439e-01,0.129410,0.129410,7.500000e-01,4.330127e-01,4.330127e-01,4.330127e-01,4.330127e-01,0.250000
352805,68,17,2022-01-01,1,5,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,...,2.500000e-01,2.500000e-01,0.066987,0.066987,9.330127e-01,2.500000e-01,2.500000e-01,2.500000e-01,2.500000e-01,0.066987
352804,66,17,2022-01-01,1,2,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,...,2.500000e-01,2.500000e-01,0.066987,0.066987,9.330127e-01,2.500000e-01,2.500000e-01,2.500000e-01,2.500000e-01,0.066987
352803,65,17,2022-01-01,1,1,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,...,2.500000e-01,2.500000e-01,0.066987,0.066987,9.330127e-01,2.500000e-01,2.500000e-01,2.500000e-01,2.500000e-01,0.066987
352802,64,17,2022-01-01,1,2,-0.794859,-0.940391,-0.580302,-0.341636,-1.192860,...,2.500000e-01,2.500000e-01,0.066987,0.066987,9.330127e-01,2.500000e-01,2.500000e-01,2.500000e-01,2.500000e-01,0.066987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345134,43,12,2022-09-30,9,4,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,...,3.169619e-17,3.169619e-17,-0.258819,-0.258819,1.499760e-32,-1.224647e-16,-1.224647e-16,-1.224647e-16,-1.224647e-16,1.000000
345135,44,12,2022-09-30,9,2,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,...,3.169619e-17,3.169619e-17,-0.258819,-0.258819,1.499760e-32,-1.224647e-16,-1.224647e-16,-1.224647e-16,-1.224647e-16,1.000000
345136,45,12,2022-09-30,9,3,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,...,3.169619e-17,3.169619e-17,-0.258819,-0.258819,1.499760e-32,-1.224647e-16,-1.224647e-16,-1.224647e-16,-1.224647e-16,1.000000
345122,29,12,2022-09-30,9,3,-0.285740,-0.388110,-0.245859,-0.315915,2.005748,...,3.169619e-17,3.169619e-17,-0.258819,-0.258819,1.499760e-32,-1.224647e-16,-1.224647e-16,-1.224647e-16,-1.224647e-16,1.000000


In [227]:
# Training data
features = num_viajes_weather.loc[:, ['cluster', 
                                          'hour',
                                          'week_day'
                                          ]]  # features
labels = num_viajes_weather.loc[:, 'trips']  # target

# Prediction data
prediction_features = test.loc[:, ['cluster', 
                                          'hour',
                                          'week_day'
                                          ]]

model.fit(features, labels)

In [228]:
pred = pd.Series(model.predict(prediction_features))

In [229]:
test['prediction']= pred
test['prediction'] = test['prediction'].fillna(0)

PREPARACIÓN DE LA ENTREGA

In [230]:
test.drop_duplicates

<bound method DataFrame.drop_duplicates of                       Id cluster       date  hour  dia_semana  mes season  \
0       72_2022-09-30_22      72 2022-09-30    22           4    9      3   
1       93_2022-09-30_22      93 2022-09-30    22           4    9      3   
2       36_2022-09-30_22      36 2022-09-30    22           4    9      3   
3       83_2022-09-30_22      83 2022-09-30    22           4    9      3   
4       92_2022-09-30_22      92 2022-09-30    22           4    9      3   
...                  ...     ...        ...   ...         ...  ...    ...   
131833  45_2023-01-01_00      45 2023-01-01     0           6    1      4   
131834  59_2023-01-01_00      59 2023-01-01     0           6    1      4   
131835  25_2023-01-01_00      25 2023-01-01     0           6    1      4   
131836  37_2023-01-01_00      37 2023-01-01     0           6    1      4   
131837  85_2023-01-01_00      85 2023-01-01     0           6    1      4   

            tavg      tmin      

In [231]:
test.rename(columns={'prediction': 'trips'}, inplace=True)
test = pd.DataFrame(data=test, columns=["Id", "trips"])
test

Unnamed: 0,Id,trips
0,72_2022-09-30_22,3.169760
1,93_2022-09-30_22,2.007085
2,36_2022-09-30_22,2.215877
3,83_2022-09-30_22,1.066101
4,92_2022-09-30_22,2.686094
...,...,...
131833,45_2023-01-01_00,0.985867
131834,59_2023-01-01_00,2.762728
131835,25_2023-01-01_00,2.581838
131836,37_2023-01-01_00,1.980242


In [232]:
test.to_csv('GBR_1.csv',index=False)

This gave a 1.335 on kaggle. Better than Random Forest