In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

<h2>Reading the Data</h2>

In [2]:
raw=pd.read_csv('product_identifiers.csv')
data=raw.sort_values(by=['energy_per_100g'])
outlier=pd.read_csv('Outliers.csv')
outlier=outlier.drop(['Unnamed: 0'],axis=1)

res=pd.read_csv('[RES] Mean Median ML with Deviations')

<h3>Deleting Outliers from Data</h3>

In [3]:
data['Identifier']=data['Final Generic Product'].astype(str)+' '+data['energy_per_100g'].astype(str)
outlier['Identifier']=outlier['Final Generic Product'].astype(str)+' '+outlier['Outlier Energy'].astype(str)

In [4]:
data=data[~data.Identifier.isin(outlier.Identifier)]
data=data.drop(['Identifier'],axis=1)

<h2>Assign Index for Future Reference</h2>

In [5]:
data=data.dropna(subset=['Final Generic Product'])
data['IDX Final Generic Product'] =data['Final Generic Product'].factorize()[0]

In [6]:
data[['energy_per_100g','Final Generic Product','IDX Final Generic Product']]

Unnamed: 0,energy_per_100g,Final Generic Product,IDX Final Generic Product
23230,0.0,Chicken - prepared - refrigerated,0
22758,0.0,Random Category,1
23244,0.0,Turkey - raw - refrigerated,2
11602,0.0,Flavoured Water,3
11601,0.0,Flavoured Water,3
...,...,...,...
29429,,Sugar Candy - Others - others,1121
29430,,Sugar Candy - Others - fruits,1020
29431,,Sugar Candy - Others - fruits,1020
29432,,Calf - sausage - refrigerated,1969


In [7]:
attributes=['energy_per_100g','IDX Final Generic Product']
data_num=data[attributes]

<h2>Data Cleansing for Labels - Energy</h2>

In [8]:
data_tr=data_num.dropna(subset=['energy_per_100g'])

<h2>Training Set and Test Set</h2>

In [9]:
# Get Training Set
    # Training = 0.75, Test = 0.25
training,testing=train_test_split(data_tr,test_size=0.25,random_state=42)

In [10]:
# Machine Learning Data Preparation
training_set=training.drop('energy_per_100g',axis=1)
training_labels=training['energy_per_100g'].copy()

<h2>ML 1 - Linear Regression</h2>

In [11]:
# Selecting Model and Training

training_prepared=training_set
# training_labels=num_pipeline.fit_transform(training_labels)

lin_reg = LinearRegression()
lin_reg.fit(training_prepared, training_labels)

LinearRegression()

In [12]:
# Pipelining
some_labels = training_labels.iloc[:5]
some_data_prepared = training_set.iloc[:5]

print("Prediction:", lin_reg.predict(some_data_prepared))
print("Label:", list(some_labels))


Prediction: [ 35.05248168  49.22911586 113.63153973  38.69790189  71.10163717]
Label: [152.0, 37.0, 67.0, 17.0, 41.0]


<H3>ML 1 ERROR - Linear Regression </H3>

In [13]:
training_predictions = lin_reg.predict(training_prepared)
lin_mse = mean_squared_error(training_labels, training_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

91.85454179324883

<h2>ML 2 - Tree Regression</h2>

In [14]:
# different model chosen than linear - decision tree
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(training_prepared, training_labels)

DecisionTreeRegressor(random_state=42)

<H3>ML 2 ERROR - Tree Regression </H3>

In [15]:
training_predictions = tree_reg.predict(training_prepared)
tree_mse = mean_squared_error(training_labels, training_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

52.705177067291345

<h2>ML 3 - Random Forest</h2>

In [16]:
# Another Modelling - Ensemble Leaning (Random Forest Regressor)
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(training_prepared, training_labels)

RandomForestRegressor(n_estimators=10, random_state=42)

<H3>ML 3 ERROR - Random Forest </H3>

In [17]:
training_predictions = forest_reg.predict(training_prepared)
forest_mse = mean_squared_error(training_labels, training_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

53.34070446840405

<h2>Implementation ML 2</h2>

In [18]:
data_set=data_tr.drop('energy_per_100g',axis=1)
data_labels=data_tr['energy_per_100g'].copy()
prepared=data_num.drop('energy_per_100g',axis=1)

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(data_set, data_labels)

predictions = tree_reg.predict(prepared)

In [19]:
data_num['Energy Prediction']=predictions
data_num.sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_num['Energy Prediction']=predictions


Unnamed: 0,energy_per_100g,IDX Final Generic Product,Energy Prediction
0,,1299,827.666667
1,305.0,957,308.600000
2,248.0,858,251.000000
3,254.0,858,251.000000
4,305.0,957,308.600000
...,...,...,...
29429,,1121,415.250000
29430,,1020,386.500000
29431,,1020,386.500000
29432,,1969,827.666667


<h2>Exporting Results</h2>

In [91]:
res=pd.read_csv('[RES] Mean Median ML with Deviations')
temp=data[['Final Generic Product']]

In [93]:
temp['W/O Outlier ML']=data_num['Energy Prediction']

In [94]:
temp=temp.drop_duplicates(subset=['Final Generic Product'])

In [99]:
res=pd.read_csv('[RES] Mean Median ML with Deviations')
res=res.merge(temp,how='outer',on=['Final Generic Product'])
# naarr=[]
# n=0
# for elem in res['Final Generic Product']:
#     if type(elem)==float:
#         naarr.append(n)
#     n=n+1

In [102]:
def na_kcal(df,col):
    if df[col]>0:
        if df['energy_per_100g']>0:
            pred=df[col]
            exist=df['energy_per_100g']
            return abs((pred-exist)/exist)
        elif df['energy_per_100g']==0:
            return 1
    else:
        return np.nan

In [103]:
res['W/O Outlier ML Deviation']=res.apply(lambda res:na_kcal(res,'W/O Outlier ML'),axis=1)

In [106]:
res.to_csv('[RES] Mean Median ML with Deviations.csv')