In [130]:
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.cbook import boxplot_stats
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

# Pre Processing

In [131]:
df = pd.read_csv("Final.csv")
df.head()
df.fillna("NaN", inplace=True)
df["ORDER_CREATION_DATE"] = pd.to_datetime(df["ORDER_CREATION_DATE"], format="%Y%m%d")
df["REQUESTED_DELIVERY_DATE"] = pd.to_datetime(df["REQUESTED_DELIVERY_DATE"], format="%Y%m%d")
df.drop(df[(df["ORDER_CREATION_DATE"] > df["REQUESTED_DELIVERY_DATE"])].index, inplace=True)
df["ORDER_AMOUNT"] = df["ORDER_AMOUNT"].str.replace("-", "")
df["ORDER_AMOUNT"] = df["ORDER_AMOUNT"].str.replace(",", ".")
df["RELEASED_CREDIT_VALUE"] = df["RELEASED_CREDIT_VALUE"].str.replace("-", "")
df["RELEASED_CREDIT_VALUE"] = df["RELEASED_CREDIT_VALUE"].str.replace(",", ".")
df["ORDER_CURRENCY"] = df["ORDER_CURRENCY"].replace("HU1", "HUF")
apikey = "347adc49463e4adfafd55bba3192ed32"
url = f"https://openexchangerates.org/api/latest.json?app_id={apikey}"
response = requests.get(url)
data = response.json()
exchange_rates = data["rates"]
def convert_to_usd(row):
    amount = row['ORDER_AMOUNT']
    currency = row['ORDER_CURRENCY']
    if currency != 'USD':
        return float(amount) / exchange_rates.get(currency, 1)
    return amount


df['amount_in_usd'] = df.apply(convert_to_usd, axis=1)
df["UNIQUE_CUST_ID"] = df["CUSTOMER_NUMBER"].astype(str) + df["COMPANY_CODE"].astype(str)
df = df.sort_values('ORDER_CREATION_DATE')


In [132]:
df['amount_in_usd']=df['amount_in_usd'].astype(float)

In [133]:
df_adjusted = df[['UNIQUE_CUST_ID','ORDER_CREATION_DATE','amount_in_usd']]

In [134]:
df_adjusted['ORDER_CREATION_DATE'] = pd.to_datetime(df_adjusted['ORDER_CREATION_DATE'])
df_adjusted['ORDER_MONTH'] = df_adjusted['ORDER_CREATION_DATE'].dt.month
monthly_data = {}
for month in range(1, 13):
    monthly_data[month] = df_adjusted[df_adjusted['ORDER_MONTH'] == month]
train_data_adjusted = pd.concat([monthly_data[1],monthly_data[2],monthly_data[3],monthly_data[4]], ignore_index=True)
test_data_adjusted = pd.concat([monthly_data[5],monthly_data[6]], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_adjusted['ORDER_CREATION_DATE'] = pd.to_datetime(df_adjusted['ORDER_CREATION_DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_adjusted['ORDER_MONTH'] = df_adjusted['ORDER_CREATION_DATE'].dt.month


In [135]:
Q1 = train_data_adjusted.amount_in_usd.astype(float).quantile(0.25)
Q3 = train_data_adjusted.amount_in_usd.astype(float).quantile(0.75)
IQR = Q3-Q1
lb = (Q1-2.2*IQR)
ub = (Q3+2.2*IQR)

print(str(lb)+" "+str(ub))
mean_replace = np.mean(train_data_adjusted[~((train_data_adjusted.amount_in_usd.astype(float) < lb) | (train_data_adjusted.amount_in_usd.astype(float) > ub))]['amount_in_usd'].astype(float))
print(str(mean_replace))
train_data_adjusted['amount_in_usd'] = train_data_adjusted['amount_in_usd'].astype(float)
test_data_adjusted['amount_in_usd'] = test_data_adjusted['amount_in_usd'].astype(float)

train_data_adjusted['amount_in_usd'].loc[(train_data_adjusted['amount_in_usd'] < lb) | (train_data_adjusted['amount_in_usd'] > ub)]=ub

-1544.070207451883 2291.698142009741
395.03193125009534


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_adjusted['amount_in_usd'].loc[(train_data_adjusted['amount_in_usd'] < lb) | (train_data_adjusted['amount_in_usd'] > ub)]=ub


In [136]:
train_data_adjusted = train_data_adjusted.groupby(['UNIQUE_CUST_ID', 'ORDER_CREATION_DATE'], as_index=False).agg({'amount_in_usd': 'sum'})
train_data_adjusted = train_data_adjusted.sort_values('ORDER_CREATION_DATE')
train_data_adjusted.head()  


Unnamed: 0,UNIQUE_CUST_ID,ORDER_CREATION_DATE,amount_in_usd
20314,12104172983000,2022-01-01,1663.130243
38307,123001123350,2022-01-01,416.977414
21318,12104312003030,2022-01-01,746.491071
51983,123118073220,2022-01-01,1874.676473
21196,12104307433030,2022-01-01,1115.045908


In [137]:
test_data_adjusted = test_data_adjusted.groupby(['UNIQUE_CUST_ID', 'ORDER_CREATION_DATE'], as_index=False).agg({'amount_in_usd': 'sum'})
test_data_adjusted = test_data_adjusted.sort_values('ORDER_CREATION_DATE')
test_data_adjusted.head()  


Unnamed: 0,UNIQUE_CUST_ID,ORDER_CREATION_DATE,amount_in_usd
4061,12103577693000,2022-05-01,1114.588043
2944,12101795693260,2022-05-01,0.0
4640,12104088433030,2022-05-01,353.33783
3921,12103494423000,2022-05-01,186.617381
2935,12101784533000,2022-05-01,152.71399


In [139]:
def difference_in_days(melt, lags, ffday, customer_id_col, create_date_col, net_amount_col):
    for i in range(ffday, lags+1):
        melt['Last-'+str(i)+'day_Sales'] = melt.groupby([customer_id_col])[net_amount_col].shift(i)

    melt = melt.reset_index(drop = True)

    for i in range(ffday, lags+1):
        melt['Last-'+str(i)+'day_Diff']  = melt.groupby([customer_id_col])['Last-'+str(i)+'day_Sales'].diff()
    melt = melt.fillna(0)
    return melt

train_data_adjusted = difference_in_days(train_data_adjusted,7,1,'UNIQUE_CUST_ID','ORDER_CREATION_DATE','amount_in_usd')
test_data_adjusted = difference_in_days(test_data_adjusted,7,1,'UNIQUE_CUST_ID','ORDER_CREATION_DATE','amount_in_usd')



In [140]:
train_data_adjusted.columns

Index(['UNIQUE_CUST_ID', 'ORDER_CREATION_DATE', 'amount_in_usd',
       'Last-1day_Sales', 'Last-2day_Sales', 'Last-3day_Sales',
       'Last-4day_Sales', 'Last-5day_Sales', 'Last-6day_Sales',
       'Last-7day_Sales', 'Last-1day_Diff', 'Last-2day_Diff', 'Last-3day_Diff',
       'Last-4day_Diff', 'Last-5day_Diff', 'Last-6day_Diff', 'Last-7day_Diff'],
      dtype='object')

In [141]:
df.columns

Index(['CUSTOMER_ORDER_ID', 'SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION',
       'RELEASED_CREDIT_VALUE', 'PURCHASE_ORDER_TYPE', 'COMPANY_CODE',
       'ORDER_CREATION_DATE', 'ORDER_CREATION_TIME', 'CREDIT_CONTROL_AREA',
       'SOLD_TO_PARTY', 'ORDER_AMOUNT', 'REQUESTED_DELIVERY_DATE',
       'ORDER_CURRENCY', 'CREDIT_STATUS', 'CUSTOMER_NUMBER', 'amount_in_usd',
       'UNIQUE_CUST_ID'],
      dtype='object')

# Splitting

In [142]:
df['ORDER_CREATION_DATE'] = pd.to_datetime(df['ORDER_CREATION_DATE'])
df['ORDER_MONTH'] = df['ORDER_CREATION_DATE'].dt.month
monthly_data = {}
for month in range(1, 13):
    monthly_data[month] = df[df['ORDER_MONTH'] == month]
train_data = pd.concat([monthly_data[1],monthly_data[2],monthly_data[3],monthly_data[4]], ignore_index=True)
test_data = pd.concat([monthly_data[5],monthly_data[6]], ignore_index=True)

In [143]:
train_data = train_data.merge(train_data_adjusted, on=['UNIQUE_CUST_ID', 'ORDER_CREATION_DATE'], how='inner')
test_data = test_data.merge(test_data_adjusted, on=['UNIQUE_CUST_ID', 'ORDER_CREATION_DATE'], how='inner')


In [144]:
train_data.rename(columns={'amount_in_usd_x': 'amount_in_usd'}, inplace=True)
train_data.rename(columns={'amount_in_usd_y': 'net_amount_in_usd'}, inplace=True)

test_data.rename(columns={'amount_in_usd_x': 'amount_in_usd'}, inplace=True)
test_data.rename(columns={'amount_in_usd_y': 'net_amount_in_usd'}, inplace=True)

# EDA and Feature Engineering

In [145]:
Q1 = train_data.amount_in_usd.astype(float).quantile(0.25)
Q3 = train_data.amount_in_usd.astype(float).quantile(0.75)
IQR = Q3-Q1
lb = (Q1-2.2*IQR)
ub = (Q3+2.2*IQR)

print(str(lb)+" "+str(ub))
mean_replace = np.mean(train_data[~((train_data.amount_in_usd.astype(float) < lb) | (train_data.amount_in_usd.astype(float) > ub))]['amount_in_usd'].astype(float))
print(str(mean_replace))
train_data['amount_in_usd'] = train_data['amount_in_usd'].astype(float)
test_data['amount_in_usd'] = test_data['amount_in_usd'].astype(float)

train_data['amount_in_usd'].loc[(train_data['amount_in_usd'] < lb) | (train_data['amount_in_usd'] > ub)]=ub
# test_data['amount_in_usd'].loc[(test_data['amount_in_usd'] < lb) | (test_data['amount_in_usd'] > ub)]=mean_replace


-1544.070207451883 2291.698142009741
395.0319312500955


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['amount_in_usd'].loc[(train_data['amount_in_usd'] < lb) | (train_data['amount_in_usd'] > ub)]=ub


In [146]:
train_data['DISTRIBUTION_CHANNEL'] = train_data['DISTRIBUTION_CHANNEL'].astype(str)
train_data['DIVISION'] = train_data['DIVISION'].astype(str)
train_data['PURCHASE_ORDER_TYPE'] = train_data['PURCHASE_ORDER_TYPE'].astype(str)
train_data['CREDIT_CONTROL_AREA'] = train_data['CREDIT_CONTROL_AREA'].astype(str)
train_data['CREDIT_STATUS'] = train_data['CREDIT_STATUS'].astype(str)

test_data['DISTRIBUTION_CHANNEL'] = test_data['DISTRIBUTION_CHANNEL'].astype(str)
test_data['DIVISION'] = test_data['DIVISION'].astype(str)
test_data['PURCHASE_ORDER_TYPE'] = test_data['PURCHASE_ORDER_TYPE'].astype(str)
test_data['CREDIT_CONTROL_AREA'] = test_data['CREDIT_CONTROL_AREA'].astype(str)
test_data['CREDIT_STATUS'] = test_data['CREDIT_STATUS'].astype(str)

In [147]:
train_data['EXPECTED_DELIVERY_TIME'] = (train_data['REQUESTED_DELIVERY_DATE'] - train_data['ORDER_CREATION_DATE']).dt.days
test_data['EXPECTED_DELIVERY_TIME'] = (test_data['REQUESTED_DELIVERY_DATE'] - test_data['ORDER_CREATION_DATE']).dt.days

In [148]:

categorical_columns = ['SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION', 'CREDIT_CONTROL_AREA', 'CREDIT_STATUS','UNIQUE_CUST_ID']
# label_encoder = LabelEncoder()
# for column in categorical_columns:
#     label_encoder.fit(train_data[column])
#     train_data[column] = label_encoder.transform(train_data[column].astype(str))
#     test_data[column] = label_encoder.transform(test_data[column].astype(str))


le = LabelEncoder()
for column in categorical_columns:
    train_data[column] = le.fit_transform(train_data[column])
    dic = dict(zip(le.classes_, le.transform(le.classes_)))
    test_data[column]=test_data[column].map(dic).fillna(6474)


In [149]:
test_data[test_data[['SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION', 'CREDIT_CONTROL_AREA', 'CREDIT_STATUS','UNIQUE_CUST_ID']]=="NaN"].count()

CUSTOMER_ORDER_ID          0
SALES_ORG                  0
DISTRIBUTION_CHANNEL       0
DIVISION                   0
RELEASED_CREDIT_VALUE      0
PURCHASE_ORDER_TYPE        0
COMPANY_CODE               0
ORDER_CREATION_DATE        0
ORDER_CREATION_TIME        0
CREDIT_CONTROL_AREA        0
SOLD_TO_PARTY              0
ORDER_AMOUNT               0
REQUESTED_DELIVERY_DATE    0
ORDER_CURRENCY             0
CREDIT_STATUS              0
CUSTOMER_NUMBER            0
amount_in_usd              0
UNIQUE_CUST_ID             0
ORDER_MONTH                0
net_amount_in_usd          0
Last-1day_Sales            0
Last-2day_Sales            0
Last-3day_Sales            0
Last-4day_Sales            0
Last-5day_Sales            0
Last-6day_Sales            0
Last-7day_Sales            0
Last-1day_Diff             0
Last-2day_Diff             0
Last-3day_Diff             0
Last-4day_Diff             0
Last-5day_Diff             0
Last-6day_Diff             0
Last-7day_Diff             0
EXPECTED_DELIV

In [None]:
# def difference_in_days(melt, lags, ffday, customer_id_col, create_date_col, net_amount_col):
#     for i in range(ffday, lags+1):
#         melt['Last-'+str(i)+'day_Sales'] = melt.groupby([customer_id_col])[net_amount_col].shift(i)

#     melt = melt.reset_index(drop = True)

#     for i in range(ffday, lags+1):
#         melt['Last-'+str(i)+'day_Diff']  = melt.groupby([customer_id_col])['Last-'+str(i)+'day_Sales'].diff()
#     melt = melt.fillna(0)
#     return melt

# train_data = difference_in_days(train_data,7,1,'UNIQUE_CUST_ID','ORDER_CREATION_DATE','NET_AMOUNT_PER_DAY')
# test_data = difference_in_days(test_data,7,1,'UNIQUE_CUST_ID','ORDER_CREATION_DATE','NET_AMOUNT_PER_DAY')

In [150]:
train_data['RELEASED_CREDIT_VALUE'] = train_data['RELEASED_CREDIT_VALUE'].astype(float)
train_data['RELEASED_CREDIT_VALUE_LOG'] = np.where(train_data['RELEASED_CREDIT_VALUE'] > 0, np.log(train_data['RELEASED_CREDIT_VALUE']), train_data['RELEASED_CREDIT_VALUE'])
train_data['Last-1day_Sales_log'] = np.where(train_data['Last-1day_Sales'] > 0, np.log(train_data['Last-1day_Sales']), train_data['Last-1day_Sales'])
train_data['Last-2day_Sales_log'] = np.where(train_data['Last-2day_Sales'] > 0, np.log(train_data['Last-2day_Sales']), train_data['Last-2day_Sales'])
train_data['Last-3day_Sales_log'] = np.where(train_data['Last-3day_Sales'] > 0, np.log(train_data['Last-3day_Sales']), train_data['Last-3day_Sales'])
train_data['Last-4day_Sales_log'] = np.where(train_data['Last-4day_Sales'] > 0, np.log(train_data['Last-4day_Sales']), train_data['Last-4day_Sales'])
train_data['Last-5day_Sales_log'] = np.where(train_data['Last-5day_Sales'] > 0, np.log(train_data['Last-5day_Sales']), train_data['Last-5day_Sales'])
train_data['Last-6day_Sales_log'] = np.where(train_data['Last-6day_Sales'] > 0, np.log(train_data['Last-6day_Sales']), train_data['Last-6day_Sales'])
train_data['Last-7day_Sales_log'] = np.where(train_data['Last-7day_Sales'] > 0, np.log(train_data['Last-7day_Sales']), train_data['Last-7day_Sales'])


train_data['Last-1day_Sales_log'].fillna(0, inplace=True)
train_data['Last-2day_Sales_log'].fillna(0, inplace=True)
train_data['Last-3day_Sales_log'].fillna(0, inplace=True)
train_data['Last-4day_Sales_log'].fillna(0, inplace=True)
train_data['Last-5day_Sales_log'].fillna(0, inplace=True)
train_data['Last-6day_Sales_log'].fillna(0, inplace=True)
train_data['Last-7day_Sales_log'].fillna(0, inplace=True)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [151]:
test_data['RELEASED_CREDIT_VALUE'] = test_data['RELEASED_CREDIT_VALUE'].astype(float)
test_data['RELEASED_CREDIT_VALUE_LOG'] = np.where(test_data['RELEASED_CREDIT_VALUE'] > 0, np.log(test_data['RELEASED_CREDIT_VALUE']), test_data['RELEASED_CREDIT_VALUE'])
test_data['Last-1day_Sales_log'] = np.where(test_data['Last-1day_Sales'] > 0, np.log(test_data['Last-1day_Sales']), test_data['Last-1day_Sales'])
test_data['Last-2day_Sales_log'] = np.where(test_data['Last-2day_Sales'] > 0, np.log(test_data['Last-2day_Sales']), test_data['Last-2day_Sales'])
test_data['Last-3day_Sales_log'] = np.where(test_data['Last-3day_Sales'] > 0, np.log(test_data['Last-3day_Sales']), test_data['Last-3day_Sales'])
test_data['Last-4day_Sales_log'] = np.where(test_data['Last-4day_Sales'] > 0, np.log(test_data['Last-4day_Sales']), test_data['Last-4day_Sales'])
test_data['Last-5day_Sales_log'] = np.where(test_data['Last-5day_Sales'] > 0, np.log(test_data['Last-5day_Sales']), test_data['Last-5day_Sales'])
test_data['Last-6day_Sales_log'] = np.where(test_data['Last-6day_Sales'] > 0, np.log(test_data['Last-6day_Sales']), test_data['Last-6day_Sales'])
test_data['Last-7day_Sales_log'] = np.where(test_data['Last-7day_Sales'] > 0, np.log(test_data['Last-7day_Sales']), test_data['Last-7day_Sales'])



test_data['Last-1day_Sales_log'].fillna(0, inplace=True)
test_data['Last-2day_Sales_log'].fillna(0, inplace=True)
test_data['Last-3day_Sales_log'].fillna(0, inplace=True)
test_data['Last-4day_Sales_log'].fillna(0, inplace=True)
test_data['Last-5day_Sales_log'].fillna(0, inplace=True)
test_data['Last-6day_Sales_log'].fillna(0, inplace=True)
test_data['Last-7day_Sales_log'].fillna(0, inplace=True)



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [152]:
train_data['amount_in_usd_log'] = np.where(train_data['amount_in_usd'] > 0, np.log(train_data['amount_in_usd']), train_data['amount_in_usd'])
test_data['amount_in_usd_log'] = np.where(test_data['amount_in_usd'] > 0, np.log(test_data['amount_in_usd']), test_data['amount_in_usd'])



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [153]:
train_data['net_amount_in_usd_log'] = np.where(train_data['net_amount_in_usd'] > 0, np.log(train_data['net_amount_in_usd']), train_data['net_amount_in_usd'])
test_data['net_amount_in_usd_log'] = np.where(test_data['net_amount_in_usd'] > 0, np.log(test_data['net_amount_in_usd']), test_data['net_amount_in_usd'])


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [154]:
train_data['ORDER_CREATION_DATE'] = train_data['ORDER_CREATION_DATE'].astype(int)
test_data['ORDER_CREATION_DATE'] = test_data['ORDER_CREATION_DATE'].astype(int)

# MODELS

In [155]:

features = ['RELEASED_CREDIT_VALUE_LOG','UNIQUE_CUST_ID','EXPECTED_DELIVERY_TIME','SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION', 'CREDIT_CONTROL_AREA', 'CREDIT_STATUS','Last-1day_Sales_log','Last-2day_Sales_log','Last-3day_Sales_log','Last-4day_Sales_log','Last-5day_Sales_log','Last-6day_Sales_log','Last-7day_Sales_log']

target = 'amount_in_usd_log'
X_train = train_data[features]
X_test = test_data[features]
y_train = train_data[target]
y_test = test_data[target]
r2_train_list = []
r2_test_list = []
mse_train_list = []
mse_test_list = []
rmse_train_list = []
rmse_test_list = []

In [156]:
model = LinearRegression()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

r2_train_list.append(r2_train)
r2_test_list.append(r2_test)
mse_train_list.append(mse_train)
mse_test_list.append(mse_test)
rmse_train_list.append(rmse_train)
rmse_test_list.append(rmse_test)

print("Train - R-squared Score:", r2_train)
print("Train - Mean Squared Error:", mse_train)
print("Test - R-squared Score:", r2_test)
print("Test - Mean Squared Error:", mse_test)

Train - R-squared Score: 0.17047885475898295
Train - Mean Squared Error: 6.097019234883232
Test - R-squared Score: 0.11760918863790326
Test - Mean Squared Error: 7.668679674648252


In [157]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

print("Train - R-squared Score:", r2_train)
print("Train - Mean Squared Error:", mse_train)
print("Test - R-squared Score:", r2_test)
print("Test - Mean Squared Error:", mse_test)

r2_train_list.append(r2_train)
r2_test_list.append(r2_test)
mse_train_list.append(mse_train)
mse_test_list.append(mse_test)
rmse_train_list.append(rmse_train)
rmse_test_list.append(rmse_test)

Train - R-squared Score: 0.999981829502772
Train - Mean Squared Error: 0.00013355400491181342
Test - R-squared Score: -0.5963397615195354
Test - Mean Squared Error: 13.873465277931324


In [None]:
train_data.columns

Index(['CUSTOMER_ORDER_ID', 'SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION',
       'RELEASED_CREDIT_VALUE', 'PURCHASE_ORDER_TYPE', 'COMPANY_CODE',
       'ORDER_CREATION_DATE', 'ORDER_CREATION_TIME', 'CREDIT_CONTROL_AREA',
       'SOLD_TO_PARTY', 'ORDER_AMOUNT', 'REQUESTED_DELIVERY_DATE',
       'ORDER_CURRENCY', 'CREDIT_STATUS', 'CUSTOMER_NUMBER', 'amount_in_usd',
       'UNIQUE_CUST_ID', 'ORDER_MONTH', 'net_amount_in_usd', 'Last-1day_Sales',
       'Last-2day_Sales', 'Last-3day_Sales', 'Last-4day_Sales',
       'Last-5day_Sales', 'Last-6day_Sales', 'Last-7day_Sales',
       'Last-1day_Diff', 'Last-2day_Diff', 'Last-3day_Diff', 'Last-4day_Diff',
       'Last-5day_Diff', 'Last-6day_Diff', 'Last-7day_Diff',
       'EXPECTED_DELIVERY_TIME', 'RELEASED_CREDIT_VALUE_LOG',
       'Last-1day_Sales_log', 'Last-2day_Sales_log', 'Last-3day_Sales_log',
       'Last-4day_Sales_log', 'Last-5day_Sales_log', 'Last-6day_Sales_log',
       'Last-7day_Sales_log', 'amount_in_usd_log', 'net_amount_in_usd

In [None]:
# params = {
#     'n_estimators' : 800, #around 18-20 minutes 0.78 - 0.8
#     'random_state': 42 
# }
# model = xgb.XGBRegressor(**params) #0.7659 - 0.7898 no params
# # X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# model.fit(X_train,y_train)
# y_train_pred = model.predict(X_train)
# y_test_pred = model.predict(X_test)

# r2_train = r2_score(y_train, y_train_pred)
# r2_test = r2_score(y_test, y_test_pred)

# mse_train = mean_squared_error(y_train, y_train_pred)
# mse_test = mean_squared_error(y_test, y_test_pred)

# print("Train - R-squared Score:", r2_train)
# print("Train - Mean Squared Error:", mse_train)
# print("Test - R-squared Score:", r2_test)
# print("Test - Mean Squared Error:", mse_test)

# r2_train_list.append(r2_train)
# r2_test_list.append(r2_test)
# mse_train_list.append(mse_train)
# mse_test_list.append(mse_test)
# rmse_train_list.append(rmse_train)
# rmse_test_list.append(rmse_test)

In [158]:
import lightgbm as lgb
train_data = lgb.Dataset(X_train, label=y_train)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 80, #80-100
    'learning_rate': 0.05, #.06 -.07
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 6,#3-5
    'verbose': 1
}
model = lgb.train(params,train_data,num_boost_round=1000)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

print("Train - R-squared Score:", r2_train)
print("Train - Mean Squared Error:", mse_train)
print("Test - R-squared Score:", r2_test)
print("Test - Mean Squared Error:", mse_test)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2979
[LightGBM] [Info] Number of data points in the train set: 855281, number of used features: 15
[LightGBM] [Info] Start training from score 4.583310
Train - R-squared Score: 0.8147633141608892
Train - Mean Squared Error: 1.3614983090502613
Test - R-squared Score: 0.2936089058455621
Test - Mean Squared Error: 6.139101808792215


In [None]:
print("Train - R-squared Score:", r2_train)
print("Train - Mean Squared Error:", mse_train)
print("Test - R-squared Score:", r2_test)
print("Test - Mean Squared Error:", mse_test)

Train - R-squared Score: 0.7815425261728309
Train - Mean Squared Error: 1.6056726553260499
Test - R-squared Score: 0.4208449204736099
Test - Mean Squared Error: 5.033319397305885


Train - R-squared Score: 0.8474059308029369<br>
Train - Mean Squared Error: 1.0296064647212406<br>
Test - R-squared Score: 0.8224336004045232<br>
Test - Mean Squared Error: 1.2411536841059796<br>

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from pmdarima.arima import auto_arima
data = train_data
data['ORDER_CREATION_DATE'] = pd.to_datetime(data['ORDER_CREATION_DATE'])

data = data.set_index('ORDER_CREATION_DATE')


time_series_data = data["amount_in_usd"]


max_p = 5
max_d = 2
max_q = 5


# Use auto_arima to determine the optimal values of p, d, and q
model = auto_arima(time_series_data, start_p=1, d=None, start_q=1,
                   max_p=max_p, max_d=max_d, max_q=max_q,
                   seasonal=False, trace=True, error_action='ignore',
                   suppress_warnings=True, stepwise=True)

# Print the optimal values of p, d, and q
print("Optimal values: p={}, d={}, q={}".format(model.order[0], model.order[1], model.order[2]))


TypeError: 'Dataset' object is not subscriptable