In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('selected_col_v2.csv')
data = data.drop(['Unnamed: 0'], axis=1)
data = data.dropna()
# TESTINTG: drop High_bid over 100
data = data[data['High_bid'] < 100]
data.shape


(2039, 171)

In [3]:
data.loc[data['Year_2923'] == 1, 'Year_2023'] = 1
data.loc[data['Year_2923'] == 1, 'Year_2923'] = 0
# data[data['Year_2923'] == 1]
data = data.drop(['Year_2923'], axis=1)
data.shape

(2039, 170)

In [4]:
# year_col = ['Year_2005', 'Year_2007', 'Year_2008', 'Year_2009', 'Year_2010', 'Year_2011', 'Year_2012', 'Year_2013', 'Year_2014', 'Year_2015', 'Year_2016', 'Year_2017', 'Year_2018', 'Year_2019', 'Year_2020', 'Year_2021', 'Year_2022', 'Year_2023']
# country_col = ['Country_Brazil', 'Country_Burundi', 'Country_Colombia', 'Country_Costa Rica', 'Country_Ecuador', 'Country_El Salvador', 'Country_Ethiopia', 'Country_Guatemala', 'Country_Honduras', 'Country_Indonesia', 'Country_México', 'Country_Nicaragua', 'Country_Perú', 'Country_Rwanda']
variety_col = ['geisha', 'pacamara', 'catuai', 'caturra', 'bourbon', 'typica', 'sl-28', 'sl-34', 'colombia', '74112', '74110', '74158', 'mixed_variety']
buyer_col = ['Asia', 'North_America', 'Nordic', 'Europe', 'Others']
farm_col = []
year_col = []
process_col = []
country_col = []
flavor_col = ['Clean_and_clear', 'Balance_cup', 'Transparent_cup', 'Creamy_body', 'Big_body', 'Long_aftertaste', 'Roasted_flavor', 'Spices_flavor', 'NuttyCocoa_flavor', 'Sweet_flavor', 'Floral_flavor', 'Fruity_flavor', 'GreenVegetative_flavor', 'Winey_flavor', 'Malic_acidity', 'Citric_acidity', 'Tartaric_acidity', 'Lactic_acidity', 'Complex_acidity']
for col in data.columns:
    if 'Farm' in col:
        farm_col.append(col)
    elif 'Year' in col:
        year_col.append(col)
    elif 'Process' in col:
        process_col.append(col)
    elif 'Country' in col:
        country_col.append(col)

In [5]:
basic_feature = ['Rank', 'COE_score', 'Altitude']
obj = ['High_bid']
# year_col.remove('Year_2923')
feature_set_1 = obj + basic_feature + year_col + country_col
feature_set_2 = obj + basic_feature + year_col + country_col + variety_col + process_col + buyer_col + flavor_col
                 
feature_set_3 = obj + basic_feature + year_col + country_col + variety_col + process_col + buyer_col + farm_col + flavor_col
                 
## Below is for some other trying
## Just variety and process(no flavor) without buyer
feature_set_4 = obj + basic_feature + year_col + country_col + variety_col + process_col
## Just variety and process(no flavor) with buyer
feature_set_5 = obj + basic_feature + year_col + country_col + variety_col + process_col + buyer_col
## Just flavor (no variety and process) without buyer
feature_set_6 = obj + basic_feature + year_col + country_col + flavor_col      
## Just flavor (no variety and process) with buyer
feature_set_7 = obj + basic_feature + year_col + country_col + flavor_col + buyer_col        

In [6]:
# use feature_set_2 to plot a heatmap of correlation, using plotly
import plotly.express as px
import plotly.graph_objects as go
# since there are too many features, we only plot the heatmap of correlation of features with High_bid (a row)

# correlation matrix
corr = data[feature_set_2].corr()
# only plot the heatmap of correlation of features with High_bid
# corr = corr[['High_bid']].T

# choose the positive correlation features
corr = corr[(corr > 0)]

fig = px.imshow(corr)
fig.show()

In [7]:
buyer_flavor = buyer_col + flavor_col
corr = data[buyer_flavor].corr()
corr = corr[(corr > 0)]
fig = px.imshow(corr)
# rotate the labels
fig.update_xaxes(tickangle=45)
fig.update_yaxes(tickangle=45)
fig.show()

In [8]:
high_bid_variety = obj + variety_col
corr = data[high_bid_variety].corr()
corr = corr[['High_bid']].T
corr = corr[(corr > 0)]
fig = px.imshow(corr)
# rotate the labels
fig.update_xaxes(tickangle=45)
fig.update_yaxes(tickangle=45)
fig.show()

-----

## My functions

In [9]:
def split_xy(data, feature_set):
    x = data[feature_set].drop(['High_bid'], axis=1)
    y = data['High_bid']
    return x, y

def PreProcess(data, feature_set):
    # split data into train and test
    train, test = train_test_split(data, test_size=0.2, random_state=24)
    x_train, y_train = split_xy(train, feature_set)
    x_test, y_test = split_xy(test, feature_set)

    # standardize feature values
    xscaler = preprocessing.StandardScaler().fit(x_train[['COE_score', 'Altitude']])
    # standardize feature values
    x_train_conti = xscaler.transform(x_train[['COE_score', 'Altitude']])
    x_train = np.concatenate((x_train_conti, x_train.drop(['COE_score', 'Altitude'], axis=1)), axis=1)
    x_test_conti = xscaler.transform(x_test[['COE_score', 'Altitude']])
    x_test = np.concatenate((x_test_conti, x_test.drop(['COE_score', 'Altitude'], axis=1)), axis=1)
    return x_train, y_train, x_test, y_test


In [10]:

# calculate the metrics of regression
def regression_metrics(y_test, y_pred):
    mse = np.mean((y_test - y_pred)**2)
    mae = np.mean(np.abs(y_test - y_pred))
    rmse = np.sqrt(mse)
    return mse, mae, rmse


In [11]:
def LR(x_train, y_train, x_test, y_test):
    lr_model = LinearRegression()
    lr_model.fit(x_train, y_train)
    y_pred = lr_model.predict(x_test)
    # return MAE, RMSE
    mse = np.mean((y_test - y_pred)**2)
    mae = np.mean(np.abs(y_test - y_pred))
    rmse = np.sqrt(mse)
    return y_pred, lr_model.score(x_test, y_test), mse, mae, rmse

def LASSO(x_train, y_train, x_test, y_test):
    lasso = linear_model.Lasso(alpha=0.1)
    lasso.fit(x_train, y_train)
    y_pred = lasso.predict(x_test)
    mse = np.mean((y_test - y_pred)**2)
    mae = np.mean(np.abs(y_test - y_pred))
    rmse = np.sqrt(mse)
    # print the parameters of the model
    # print('LASSO param: ',lasso.coef_)
    return y_pred, lasso.score(x_test, y_test), mse, mae, rmse

def Ridge(x_train, y_train, x_test, y_test):
    ridge = linear_model.Ridge(alpha=0.1)
    ridge.fit(x_train, y_train)
    y_pred = ridge.predict(x_test)
    mse = np.mean((y_test - y_pred)**2)
    mae = np.mean(np.abs(y_test - y_pred))
    rmse = np.sqrt(mse)
    # print the parameters of the model
    # print('Ridge param: ',ridge.coef_)
    return y_pred, ridge.score(x_test, y_test), mse, mae, rmse

def RandomForest(x_train, y_train, x_test, y_test):
    rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    mse = np.mean((y_test - y_pred)**2)
    mae = np.mean(np.abs(y_test - y_pred))
    rmse = np.sqrt(mse)
    # print('RandomForest param: ',rf.feature_importances_)
    
    return y_pred, rf.score(x_test, y_test), mse, mae, rmse

def GradientBoosting(x_train, y_train, x_test, y_test):
    gb = GradientBoostingRegressor(n_estimators=100, max_depth=10, random_state=0)
    gb.fit(x_train, y_train)
    y_pred = gb.predict(x_test)
    mse = np.mean((y_test - y_pred)**2)
    mae = np.mean(np.abs(y_test - y_pred))
    rmse = np.sqrt(mse)
    # print('gradient:',gb.get_params())
    return y_pred, gb.score(x_test, y_test), mse, mae, rmse

def XGBoost(x_train, y_train, x_test, y_test):
    xgb = XGBRegressor(n_estimators=100, max_depth=10, random_state=0)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    mse = np.mean((y_test - y_pred)**2)
    mae = np.mean(np.abs(y_test - y_pred))
    rmse = np.sqrt(mse)
    # print('XGBoost param: ',xgb.get_xgb_params())
    return y_pred, xgb.score(x_test, y_test), mse, mae, rmse

def plot_pred_by_index(y_test, y_pred):
    # plot the predicted values and the true values, using plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=np.arange(len(y_test)), y=y_test, mode='lines', name='True'))
    fig.add_trace(go.Scatter(x=np.arange(len(y_test)), y=y_pred, mode='lines', name='Predicted'))
    fig.update_layout(title='True and Predicted High_bid', xaxis_title='Index', yaxis_title='High_bid')
    fig.show()

def plot_pred(y_test, y_pred, model, feature_set):
    # plot the scatter plot of predicted values and the true values, using plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers'))
    fig.update_layout(title=f'True and Predicted High_bid of {model} using feature set {feature_set}', xaxis_title='True', yaxis_title='Predicted')
    if (feature_set == 'rf25'):
        fig.update_layout(title=f'True and Predicted High_bid of {model} using 25 features selected', xaxis_title='True', yaxis_title='Predicted')
    elif feature_set == 'rf50':
        fig.update_layout(title=f'True and Predicted High_bid of {model} using 50 features selected', xaxis_title='True', yaxis_title='Predicted')
    elif feature_set == 'rf75':
        fig.update_layout(title=f'True and Predicted High_bid of {model} using 75 features selected', xaxis_title='True', yaxis_title='Predicted')
    elif feature_set == 'rf100':
        fig.update_layout(title=f'True and Predicted High_bid of {model} using 100 features selected', xaxis_title='True', yaxis_title='Predicted')
    elif feature_set == 'joined37':
        fig.update_layout(title=f'True and Predicted High_bid of model joined using 37 features selected', xaxis_title='True', yaxis_title='Predicted')
    # add a diagonal line
    fig.add_trace(go.Scatter(x=[0, 100], y=[0, 100], mode='lines', name='Diagonal'))
    fig.show()
    # clear the figure
    fig.data = []

def plot_pred_1(y_test, y_pred, model, feature_set): # use seaborn
    # plot the scatter plot of predicted values and the true values, using seaborn
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(y_test, y_pred, c='b', marker='o')
    ax.plot([0, 100], [0, 100], 'r-', lw=2)
    ax.set_xlabel('True')
    ax.set_ylabel('Predicted')
    ax.set_title(f'True and Predicted High_bid of {model} using feature set {feature_set}')
    plt.show()
    

-----

## Feature Selection
Use Randon Forest and XGBoost to see the importance of each feature. We decide to use the ones from Random Forset.

In [12]:
# plot the feature importance of xgboost
feature_set = feature_set_3
x_train, y_train, x_test, y_test = PreProcess(data, feature_set)

xgb = XGBRegressor(n_estimators=100, max_depth=10, random_state=0)
xgb.fit(x_train, y_train)
features = data[feature_set].drop(['High_bid'], axis=1).columns
print(len(features), len(xgb.feature_importances_))
feature_importance_xgb = pd.DataFrame({'feature': features, 'importance': xgb.feature_importances_})
feature_importance_xgb = feature_importance_xgb.sort_values(by='importance', ascending=False)
fig = px.bar(feature_importance_xgb, x='feature', y='importance')
fig.show()

169 169


In [13]:
# plot the feature importance of RandomForest
feature_set = feature_set_3
x_train, y_train, x_test, y_test = PreProcess(data, feature_set)
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)
rf.fit(x_train, y_train)
# feature set = feature_set_3 removing High_bid
features = data[feature_set].drop(['High_bid'], axis=1).columns
print(len(features), len(rf.feature_importances_))
feature_importance_rf = pd.DataFrame({'feature': features, 'importance': rf.feature_importances_})
feature_importance_rf = feature_importance_rf.sort_values(by='importance', ascending=False)
fig = px.bar(feature_importance_rf, x='feature', y='importance')
fig.show()

169 169


In [14]:
# the first 25, 50, 75, 100 important features
features_25 = feature_importance_rf.iloc[:25, :]
features_25 = features_25['feature'].values
features_rf25 = list(features_25)

features_50 = feature_importance_rf.iloc[:50, :]
features_50 = features_50['feature'].values
features_rf50 = list(features_50)

features_75 = feature_importance_rf.iloc[:75, :]
features_75 = features_75['feature'].values
features_rf75 = list(features_75)

features_100 = feature_importance_rf.iloc[:100, :]
features_100 = features_100['feature'].values
features_rf100 = list(features_100)


print(features_rf25, features_rf50, features_rf75, features_rf100)

features_rf25.append('High_bid')
features_rf50.append('High_bid')
features_rf75.append('High_bid')
features_rf100.append('High_bid')

feature_set_rf25 = features_rf25
feature_set_rf50 = features_rf50
feature_set_rf75 = features_rf75
feature_set_rf100 = features_rf100

['Altitude', 'Fruity_flavor', 'Rank', 'geisha', 'COE_score', 'Citric_acidity', 'Floral_flavor', 'Sweet_flavor', 'NuttyCocoa_flavor', 'Year_2021', 'Country_Nicaragua', 'Tartaric_acidity', 'Farm_santa rosa', 'Year_2020', 'GreenVegetative_flavor', 'Complex_acidity', 'Process_honey', 'Spices_flavor', 'Malic_acidity', 'Winey_flavor', 'Country_Guatemala', 'Process_anaerobic', 'Country_Costa Rica', 'typica', 'Farm_el cerro'] ['Altitude', 'Fruity_flavor', 'Rank', 'geisha', 'COE_score', 'Citric_acidity', 'Floral_flavor', 'Sweet_flavor', 'NuttyCocoa_flavor', 'Year_2021', 'Country_Nicaragua', 'Tartaric_acidity', 'Farm_santa rosa', 'Year_2020', 'GreenVegetative_flavor', 'Complex_acidity', 'Process_honey', 'Spices_flavor', 'Malic_acidity', 'Winey_flavor', 'Country_Guatemala', 'Process_anaerobic', 'Country_Costa Rica', 'typica', 'Farm_el cerro', 'North_America', 'Lactic_acidity', 'Farm_el paraiso', 'Long_aftertaste', 'Process_natural', 'Year_2016', 'pacamara', 'Country_Ecuador', 'Creamy_body', 'Asia

In [15]:
feature_set_rf50

['Altitude',
 'Fruity_flavor',
 'Rank',
 'geisha',
 'COE_score',
 'Citric_acidity',
 'Floral_flavor',
 'Sweet_flavor',
 'NuttyCocoa_flavor',
 'Year_2021',
 'Country_Nicaragua',
 'Tartaric_acidity',
 'Farm_santa rosa',
 'Year_2020',
 'GreenVegetative_flavor',
 'Complex_acidity',
 'Process_honey',
 'Spices_flavor',
 'Malic_acidity',
 'Winey_flavor',
 'Country_Guatemala',
 'Process_anaerobic',
 'Country_Costa Rica',
 'typica',
 'Farm_el cerro',
 'North_America',
 'Lactic_acidity',
 'Farm_el paraiso',
 'Long_aftertaste',
 'Process_natural',
 'Year_2016',
 'pacamara',
 'Country_Ecuador',
 'Creamy_body',
 'Asia',
 'Year_2022',
 'Farm_platanares',
 'Roasted_flavor',
 'Farm_san luis',
 'Process_washed',
 'Country_Ethiopia',
 'caturra',
 'Country_El Salvador',
 'Country_México',
 'Country_Burundi',
 'Europe',
 'Clean_and_clear',
 'Country_Colombia',
 'Country_Brazil',
 'Country_Honduras',
 'High_bid']

## Joined one

In [16]:
# 取交集 feature_importance_rf 前50和 feature_importance_xgb前50
feature_importance_rf = feature_importance_rf[:50]
feature_importance_xgb = feature_importance_xgb[:50]
feature_set = feature_importance_rf.merge(feature_importance_xgb, on='feature')
feature_set = feature_set['feature'].tolist()
feature_set = feature_set + ['High_bid']
print(len(feature_set))
feature_set_joined37 = feature_set.copy()

37


------

In [17]:
plot_result = 1
models = ['LR', 'LASSO', 'Ridge', 'GradientBoosting', 'RandomForest', 'XGBoost']
feature_sets = ['1', '2', '3', 'rf25', 'rf50', 'rf75', 'rf100', 'joined37']

score_df = pd.DataFrame(columns=['LR', 'LASSO', 'Ridge', 'GradientBoosting', 'RandomForest', 'XGBoost'])
for f in feature_sets:
    feature_set = eval('feature_set_' + f)
    x_train, y_train, x_test, y_test = PreProcess(data, feature_set)
    for m in models:
        y_pred, score, mse, mae, rmse = eval(m)(x_train, y_train, x_test, y_test)
        score_df.loc[f, m] = rmse
        if plot_result:
            plot_pred(y_test, y_pred, m, f)
score_df

Unnamed: 0,LR,LASSO,Ridge,GradientBoosting,RandomForest,XGBoost
1,12.618951,12.967843,12.620098,8.53662,8.417946,8.165249
2,12.086606,12.55708,12.09133,9.008833,8.251548,8.292776
3,11.826794,12.55708,11.829681,9.029048,8.265557,7.962745
rf25,12.593223,12.854809,12.594112,9.166847,8.421753,8.432158
rf50,12.190302,12.709752,12.192449,9.121808,8.218736,7.747973
rf75,12.118666,12.557125,12.12085,9.1261,8.172778,7.706964
rf100,12.103448,12.557125,12.103081,9.121754,8.274796,7.799832
joined37,12.196543,12.720981,12.198,9.633894,8.16817,8.567072


## Result table:
#### Values: RMSE of each model and different feature set

In [18]:
output = score_df.round(4).copy()
output.index = ['1', '2', '3', 'rf25', 'rf50', 'rf75', 'rf100', 'joined37']
output['feature_set'] = ['1', '2', '3', 'rf25', 'rf50', 'rf75', 'rf100', 'joined37']
# output = output.iloc[:7,:]
fig = go.Figure(data=[go.Table(
    header=dict(values=['feature set','LR', 'LASSO', 'Ridge', 'GradientBoosting', 'RandomForest', 'XGBoost'],
                fill_color='lightgrey',
                align='left'),
    ids = list(output.index),
    cells=dict(values=[output.feature_set,output.LR.astype(float).round(4), output.LASSO.astype(float).round(4), output.Ridge.astype(float).round(4), output.GradientBoosting.astype(float).round(4), output.RandomForest.astype(float).round(4), output.XGBoost.astype(float).round(4)],
               fill_color='white',
               align='left'))
])


fig.show()

## Discussing the outliers

In [19]:
plot_result = True
f = '3'
feature_set = eval('feature_set_' + f)
x_train, y_train, x_test, y_test = PreProcess(data, feature_set)
m = 'XGBoost'
y_pred, score, mse, mae, rmse = eval(m)(x_train, y_train, x_test, y_test)
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers'))
fig.update_layout(title=f'True and Predicted High_bid of {m} using feature set rf50', xaxis_title='True', yaxis_title='Predicted')
fig.add_trace(go.Scatter(x=[0, 100], y=[0, 100], mode='lines', name='Diagonal'))

# add vertical line at 90.7 and 75, colored yellow, width=3.2, dash='dash'
fig.add_trace(go.Scatter(x=[90.7, 90.7], y=[0, 100], mode='lines', name='90.7', line=dict(color='yellow', width=3.2, dash='dash')))
fig.add_trace(go.Scatter(x=[75, 75], y=[0, 100], mode='lines', name='75', line=dict(color='yellow', width=3.2, dash='dash')))
fig.add_trace(go.Scatter(x=[66.6, 66.6], y=[0, 100], mode='lines', name='75', line=dict(color='yellow', width=3.2, dash='dash')))
fig.show()
# score_df.loc[f, m] = rmse
# if plot_result:
#     plot_pred(y_test, y_pred, m, f)

In [20]:
compare = [90.7,92.2,75,75.1,67.4,66.9]
# show all colums 
pd.set_option('display.max_columns', None)
data[data['High_bid'].isin(compare)]

Unnamed: 0,Rank,COE_score,High_bid,Clean_and_clear,Balance_cup,Transparent_cup,Creamy_body,Big_body,Long_aftertaste,Roasted_flavor,Spices_flavor,NuttyCocoa_flavor,Sweet_flavor,Floral_flavor,Fruity_flavor,GreenVegetative_flavor,Winey_flavor,Malic_acidity,Citric_acidity,Tartaric_acidity,Lactic_acidity,Complex_acidity,geisha,pacamara,catuai,caturra,bourbon,typica,sl-28,sl-34,colombia,74112,74110,74158,mixed_variety,Altitude,Asia,North_America,Nordic,Europe,Others,Farm_agua dulce,Farm_bella aurora,Farm_bella elizabeth,Farm_bella vista,Farm_betania,Farm_buena vista,Farm_buenos aires,Farm_chiriloma,Farm_divina providencia,Farm_don cayito,Farm_el aguacate,Farm_el bosque,Farm_el cairo,Farm_el cambalache,Farm_el cedro,Farm_el centro,Farm_el cerro,Farm_el conacaste,Farm_el diamante,Farm_el equimite,Farm_el espejo,Farm_el guayabo,Farm_el injerto i,Farm_el matazano,Farm_el mirador,Farm_el morito i,Farm_el nacimiento no.,Farm_el naranjo,Farm_el paraiso,Farm_el paraxaj,Farm_el pino,Farm_el plan,Farm_el porvenir,Farm_el recuerdo,Farm_el roble,Farm_el socorro,Farm_el tambor,Farm_el vergel,Farm_el zapote y anexos,Farm_esperanza,Farm_finca santa cruz,Farm_gakenke,Farm_guatalon,Farm_kalibus la sierra,Farm_la bella,Farm_la bendicion,Farm_la colmena,Farm_la esmeralda,Farm_la españa,Farm_la esperanza,Farm_la fortuna,Farm_la hermosa,Farm_la laguna,Farm_la loma,Farm_la lucuma,Farm_la mina,Farm_la montañita,Farm_la orquidea,Farm_la palma,Farm_la picona,Farm_la planada,Farm_la pradera,Farm_la reforma y anexos,Farm_la salsa,Farm_las brisas,Farm_las brumas,Farm_las delicias,Farm_las duanas,Farm_las flores,Farm_las macadamias,Farm_las nubes,Farm_las palmas,Farm_las ventanas,Farm_las virginias,Farm_liquidambar,Farm_los aguacates,Farm_los morales,Farm_los pinos,Farm_los pirineos,Farm_los robles,Farm_mileydi,Farm_mpanga,Farm_nueva alianza,Farm_nuevo progreso,Farm_ojo de agua,Farm_peña redonda,Farm_plan de la vega,Farm_platanares,Farm_rosma,Farm_san isidro,Farm_san jose,Farm_san luis,Farm_san rafael,Farm_santa elena,Farm_santa lucia,Farm_santa rosa,Farm_santa teresa,Farm_un regalo de dios,Farm_villaure,Farm_yandaro,Country_Brazil,Country_Burundi,Country_Colombia,Country_Costa Rica,Country_Ecuador,Country_El Salvador,Country_Ethiopia,Country_Guatemala,Country_Honduras,Country_México,Country_Nicaragua,Country_Perú,Country_Rwanda,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019,Year_2020,Year_2021,Year_2022,Year_2023,Process_anaerobic,Process_honey,Process_natural,Process_washed,Process_wet hulled
326,1,90.65,92.2,1,0,0,0,0,0,0,0,1,3,4,17,0,0,2,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,2015.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
540,2,90.4,75.1,0,0,0,0,0,0,0,1,2,8,1,17,0,0,1,2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1660.0,1.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
653,3,90.07,75.1,0,0,0,1,0,0,1,0,5,8,3,15,2,0,1,6,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1897.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1368,1,90.27,66.9,0,0,0,0,0,0,0,1,1,2,5,12,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1950.0,1.0,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
1608,1,90.39,75.0,0,1,0,0,0,1,0,0,1,0,2,4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,96.570053,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1938,20,87.97,90.7,1,0,0,0,0,1,0,3,3,3,1,18,1,0,1,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2367.395996,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
1940,22,87.82,67.4,1,0,0,0,0,1,1,5,5,2,1,16,0,0,1,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2336.048828,1.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0


-----

In [21]:
# plot rmse of different models using different feature sets, using plotly. line plot
fig = go.Figure()
for m in models:
    fig.add_trace(go.Scatter(x=feature_sets, y=score_df[m], mode='lines', name=m))
fig.update_layout(title='RMSE of different models using different feature sets', xaxis_title='Feature set', yaxis_title='RMSE')
fig.show()

In [22]:
# output = score_df.copy()
# output.index = ['1', '2', '3', 'rf25', 'rf50', 'rf75', 'rf100', 'joined39']
# output['feature_set'] = ['1', '2', '3', 'rf25', 'rf50', 'rf75', 'rf100', 'joined39']
# output = output.iloc[3:7,:]
# fig = go.Figure(data=[go.Table(
#     header=dict(values=['feature set','LR', 'LASSO', 'Ridge', 'GradientBoosting', 'RandomForest', 'XGBoost'],
#                 fill_color='lightgrey',
#                 align='left'),
#     ids = list(output.index),
#     cells=dict(values=[output.feature_set,output.LR.astype(float).round(4), output.LASSO.astype(float).round(4), output.Ridge.astype(float).round(4), output.GradientBoosting.astype(float).round(4), output.RandomForest.astype(float).round(4), output.XGBoost.astype(float).round(4)],
#                fill_color='white',
#                align='left'))
# ])


# fig.show()

In [23]:
# output = score_df.copy()
# # output.index = ['1', '2', '3', 'xgb25', 'xgb50', 'xgb75', 'xgb100', 'joined39']
# output.index = ['1', '2', '3', 'xgb25', 'xgb50', 'xgb75', 'xgb100', 'joined39']
# output['feature_set'] = ['1', '2', '3', 'xgb25', 'xgb50', 'xgb75', 'xgb100', 'joined39']
# output = output.iloc[6:7,:]
# fig = go.Figure(data=[go.Table(
#     header=dict(values=['feature set','LR', 'LASSO', 'Ridge', 'GradientBoosting', 'RandomForest', 'XGBoost'],
#                 fill_color='lightgrey',
#                 align='left'),
#     ids = list(output.index),
#     cells=dict(values=[output.feature_set,output.LR.astype(float).round(4), output.LASSO.astype(float).round(4), output.Ridge.astype(float).round(4), output.GradientBoosting.astype(float).round(4), output.RandomForest.astype(float).round(4), output.XGBoost.astype(float).round(4)],
#                fill_color='white',
#                align='left'))
# ])


# fig.show()

Ecuador 2022 (2), Ecuador 2022 (4) have bo description of flavor
El Salvador 2019 (3) is from a well known farm
Brazil 2022 (5) have no description of flavor
Colombia 2021 (12) is from a well known farm
Ethiopia 2022 (9) is from a well known farmer


-----

In [24]:
# 20 most important coefficients in LR
feature_set = feature_set_3
x_train, y_train, x_test, y_test = PreProcess(data, feature_set)
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
# feature set = feature_set_3 removing High_bid
features = data[feature_set].drop(['High_bid'], axis=1).columns
print(len(features), len(lr_model.coef_))
feature_importance = pd.DataFrame({'feature': features, 'coefficient': lr_model.coef_})
feature_importance = feature_importance.sort_values(by='coefficient', ascending=False)
fig = px.bar(feature_importance, x='feature', y='coefficient')
fig.show()


169 169
