# In [20]:
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.cbook import boxplot_stats
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
# --- Load the raw order export and normalise its text columns --------------
df = pd.read_csv("Final.csv")
# Every missing cell becomes the literal string "NaN" (presumably so the
# .str accessors below only ever see strings -- confirm against the data).
df.fillna("NaN", inplace=True)

# Both date columns are parsed from the compact YYYYMMDD representation.
for date_col in ("ORDER_CREATION_DATE", "REQUESTED_DELIVERY_DATE"):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y%m%d")

# Discard orders whose requested delivery date lies before their creation date.
df.drop(df[(df["ORDER_CREATION_DATE"] > df["REQUESTED_DELIVERY_DATE"])].index, inplace=True)

# Remove every "-" character and turn "," into "." so the values can be cast
# to float further down. NOTE(review): dropping "-" also discards the sign.
for money_col in ("ORDER_AMOUNT", "RELEASED_CREDIT_VALUE"):
    df[money_col] = df[money_col].str.replace("-", "").str.replace(",", ".")

# "HU1" is rewritten to "HUF" (presumably a mis-keyed currency code -- confirm).
df["ORDER_CURRENCY"] = df["ORDER_CURRENCY"].replace("HU1", "HUF")

# --- Fetch the latest exchange rates ----------------------------------------
# NOTE(review): this credential is hard-coded in source; it should be rotated
# and supplied through configuration instead of being committed.
apikey = "347adc49463e4adfafd55bba3192ed32"
url = f"https://openexchangerates.org/api/latest.json?app_id={apikey}"
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail with the HTTP status instead of a bare KeyError on "rates" below when
# the service rejects the request.
response.raise_for_status()
data = response.json()
exchange_rates = data["rates"]
def convert_to_usd(row):
    """Return the row's ``ORDER_AMOUNT`` converted to USD, always as a float.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) exposing
            ``'ORDER_AMOUNT'`` (anything ``float()`` accepts) and
            ``'ORDER_CURRENCY'``.

    Returns:
        The amount itself when the currency is ``'USD'``; otherwise the
        amount divided by the rate found for that currency in the
        module-level ``exchange_rates`` mapping.

    Raises:
        ValueError: If ``ORDER_AMOUNT`` cannot be parsed as a float.
    """
    # Parse once up front so USD and non-USD rows yield the same type.
    amount = float(row['ORDER_AMOUNT'])
    currency = row['ORDER_CURRENCY']
    if currency == 'USD':
        return amount
    # Currencies missing from exchange_rates fall back to a rate of 1, i.e.
    # the amount is passed through unchanged rather than raising.
    return amount / exchange_rates.get(currency, 1)
# --- Derived columns and dtypes ----------------------------------------------
df['amount_in_usd'] = df.apply(convert_to_usd, axis=1)
# A customer is identified by customer number and company code concatenated.
df["UNIQUE_CUST_ID"] = df["CUSTOMER_NUMBER"].astype(str) + df["COMPANY_CODE"].astype(str)
df.sort_values('ORDER_CREATION_DATE', inplace=True)

for numeric_col in ('RELEASED_CREDIT_VALUE', 'amount_in_usd'):
    df[numeric_col] = df[numeric_col].astype(float)
for label_col in ('DISTRIBUTION_CHANNEL', 'DIVISION', 'PURCHASE_ORDER_TYPE',
                  'CREDIT_CONTROL_AREA', 'CREDIT_STATUS'):
    df[label_col] = df[label_col].astype(str)

df['ORDER_MONTH'] = df['ORDER_CREATION_DATE'].dt.month
df['ORDER_DATE'] = df['ORDER_CREATION_DATE'].dt.day  # day of the month

# --- Month-based split: months 1-4 train, months 5-6 test --------------------
monthly_data = {}
for month in range(1, 13):
    monthly_data[month] = df[df['ORDER_MONTH'] == month]
train_data = pd.concat([monthly_data[m] for m in (1, 2, 3, 4)], ignore_index=True)
test_data = pd.concat([monthly_data[m] for m in (5, 6)], ignore_index=True)

# --- Outlier fences from the training amounts (2.2 * IQR rule) ---------------
# amount_in_usd is already float at this point, so no further casting is needed.
Q1 = train_data['amount_in_usd'].quantile(0.25)
Q3 = train_data['amount_in_usd'].quantile(0.75)
IQR = Q3 - Q1
lb = Q1 - 2.2 * IQR
ub = Q3 + 2.2 * IQR

print(str(lb)+" "+str(ub))
outlier_mask = (train_data['amount_in_usd'] < lb) | (train_data['amount_in_usd'] > ub)
# Mean of the in-range training amounts; not used further in this section.
mean_replace = train_data.loc[~outlier_mask, 'amount_in_usd'].mean()
# Cap the training amounts at the fences. The column is reassigned as a whole
# because writing through train_data['amount_in_usd'].loc[...] is chained
# assignment, which leaves the frame unchanged under pandas Copy-on-Write.
# Values below lb are raised to lb (not to ub); values above ub become ub.
train_data['amount_in_usd'] = train_data['amount_in_usd'].clip(lower=lb, upper=ub)

def difference_in_days(melt, lags, ffday, customer_id_col, create_date_col, net_amount_col):
    """Add per-customer lagged amounts and their differences to a copy of ``melt``.

    For every ``i`` in ``ffday..lags`` (inclusive) two columns are added:

    * ``'Last-<i>day_Sales'``: ``net_amount_col`` shifted down by ``i`` rows
      within each ``customer_id_col`` group.
    * ``'Last-<i>day_Diff'``: the row-to-row difference of that lag column
      within each group.

    The shift is by rows, not by calendar days, so rows are presumably
    expected to be in chronological order already -- confirm with the caller.

    Args:
        melt: Input DataFrame. It is not modified.
        lags: Largest shift to generate.
        ffday: Smallest shift to generate.
        customer_id_col: Name of the column to group by.
        create_date_col: Accepted for interface compatibility; not used.
        net_amount_col: Name of the column that is lagged.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the added columns, and
        every missing value in the whole frame (not only in the new columns)
        replaced by 0.
    """
    # reset_index returns a new frame, so the added columns never leak into
    # the caller's DataFrame; it also makes index alignment below unambiguous.
    melt = melt.reset_index(drop=True)
    shifts = range(ffday, lags + 1)

    amounts_by_customer = melt.groupby([customer_id_col])[net_amount_col]
    for i in shifts:
        melt['Last-'+str(i)+'day_Sales'] = amounts_by_customer.shift(i)

    # The diff columns come after all lag columns, keeping the column order.
    for i in shifts:
        lag_col = 'Last-'+str(i)+'day_Sales'
        melt['Last-'+str(i)+'day_Diff'] = melt.groupby([customer_id_col])[lag_col].diff()

    return melt.fillna(0)

# Lag / difference features for shifts 1..7, computed separately per split.
train_data = difference_in_days(train_data, 7, 1, 'UNIQUE_CUST_ID', 'ORDER_CREATION_DATE', 'amount_in_usd')
test_data = difference_in_days(test_data, 7, 1, 'UNIQUE_CUST_ID', 'ORDER_CREATION_DATE', 'amount_in_usd')

# Whole days between order creation and the requested delivery date.
train_data['EXPECTED_DELIVERY_TIME'] = (train_data['REQUESTED_DELIVERY_DATE'] - train_data['ORDER_CREATION_DATE']).dt.days
test_data['EXPECTED_DELIVERY_TIME'] = (test_data['REQUESTED_DELIVERY_DATE'] - test_data['ORDER_CREATION_DATE']).dt.days

categorical_columns = ['SALES_ORG', 'DISTRIBUTION_CHANNEL', 'DIVISION', 'CREDIT_CONTROL_AREA', 'CREDIT_STATUS','UNIQUE_CUST_ID']

# The encoder is fitted on the training split only; the test split is mapped
# through the resulting label -> code table.
le = LabelEncoder()
for column in categorical_columns:
    train_data[column] = le.fit_transform(train_data[column])
    # Label i of classes_ is encoded as i, so enumerate rebuilds the mapping.
    dic = {label: code for code, label in enumerate(le.classes_)}
    # Labels never seen in training get the first code no training label
    # uses. A fixed constant such as 7000 would coincide with a real code as
    # soon as a column has more than 7000 distinct training labels.
    unseen_code = len(le.classes_)
    test_data[column] = test_data[column].map(dic).fillna(unseen_code)

def _log_where_positive(values):
    """Return a float array holding ln(x) where x > 0 and x itself elsewhere.

    The logarithm is evaluated only on the strictly positive entries, so no
    divide-by-zero / invalid-value RuntimeWarning is produced for zeros or
    negative numbers. NaN entries stay NaN.
    """
    raw = np.asarray(values, dtype=float)
    transformed = raw.copy()
    positive = raw > 0
    transformed[positive] = np.log(raw[positive])
    return transformed


lag_sales_columns = ['Last-' + str(shift) + 'day_Sales' for shift in range(1, 8)]

# Same transformation for both splits; new columns are created in the same
# order as before (credit value, the seven lag columns, then the target).
for frame in (train_data, test_data):
    frame['RELEASED_CREDIT_VALUE_LOG'] = _log_where_positive(frame['RELEASED_CREDIT_VALUE'])

    for sales_col in lag_sales_columns:
        logged = _log_where_positive(frame[sales_col])
        # Missing lag values become 0. The fill happens before assignment:
        # frame[col].fillna(0, inplace=True) acts on a temporary object and
        # leaves the frame untouched under pandas Copy-on-Write.
        logged[np.isnan(logged)] = 0
        frame[sales_col + '_log'] = logged

    frame['amount_in_usd_log'] = _log_where_positive(frame['amount_in_usd'])

# Columns handed to the regressor, and the column it is asked to predict.
features = [
    'UNIQUE_CUST_ID',
    'ORDER_MONTH',
    'RELEASED_CREDIT_VALUE_LOG',
    'EXPECTED_DELIVERY_TIME',
    'SALES_ORG',
    'CREDIT_STATUS',
    'DISTRIBUTION_CHANNEL',
    'DIVISION',
    'CREDIT_CONTROL_AREA',
    'Last-1day_Sales_log',
    'Last-2day_Sales_log',
    'Last-3day_Sales_log',
    'Last-4day_Sales_log',
    'Last-5day_Sales_log',
    'Last-6day_Sales_log',
    'Last-7day_Sales_log',
]
target = 'amount_in_usd_log'

X_train, y_train = train_data[features], train_data[target]
X_test, y_test = test_data[features], test_data[target]

import xgboost as xgb

# Hyper-parameters passed straight through to XGBRegressor.
params = dict(n_estimators=300, learning_rate=0.01, random_state=42)
model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train)

# Predictions on both splits so train and test scores can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

r2_train, mse_train = r2_score(y_train, y_train_pred), mean_squared_error(y_train, y_train_pred)
r2_test, mse_test = r2_score(y_test, y_test_pred), mean_squared_error(y_test, y_test_pred)

# Scores are computed on the target column as defined above.
metric_report = (
    ("Train - R-squared Score:", r2_train),
    ("Train - Mean Squared Error:", mse_train),
    ("Test - R-squared Score:", r2_test),
    ("Test - Mean Squared Error:", mse_test),
)
for metric_caption, metric_value in metric_report:
    print(metric_caption, metric_value)
