In [1]:
# Parameters
# NOTE(review): this looks like an injected parameters cell from a scheduled
# (Airflow/papermill-style) run — confirm how it is produced.
# Base directory prefix: other cells build paths as f"{relative_location}db/...",
# i.e. by plain string concatenation, so the value must end with a path separator.
relative_location = "/opt/airflow/Projects/cryptobay/"


In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
#import  matplotlib.pyplot as plt
import pickle

In [3]:
# Used for Airflow: a scheduled run is expected to have defined
# `relative_location` already. When nothing has defined it, fall back to an
# empty prefix so the "db/..." paths are plain relative paths.
relative_location = globals().get("relative_location", "")

In [4]:
# RAW DF ingestion
# Column names applied to the sold-ship CSV exports (passed as `names=` to read_csv).
columns = [
    'transaction_id', 'ship_id', 'ship_class', 'ship_durability', 'ship_owner',
    'ship_attr_space', 'ship_attr_speed', 'ship_attr_skill', 'ship_attr_defence', 'ship_attr_attack', 'ship_attr_morale',
    'ship_parts_keel', 'ship_parts_sail', 'ship_parts_side', 'ship_parts_bow', 'ship_parts_cabin', 'ship_parts_stern',
    'ship_sold_price',
]


def _read_sold_ships(csv_path):
    """Read one sold-ship export and keep the last row per (transaction_id, ship_id).

    `csv_path` is appended to `relative_location`. The returned frame has a fresh
    0..n-1 index; because `reset_index()` is called without `drop=True`, the
    previous row labels are kept as an extra 'index' column.
    """
    sold = pd.read_csv(f"{relative_location}{csv_path}", names=columns)
    deduplicated = sold.drop_duplicates(subset=['transaction_id', 'ship_id'], keep='last')
    return deduplicated.reset_index()


raw_df = _read_sold_ships("db/extracted_sold_USD.csv")
raw_df_bnb = _read_sold_ships("db/extracted_sold_BNB.csv")

In [5]:
# Keep only the last rows of each dataset (presumably the most recent sales —
# confirm the CSV exports are append-ordered).
# NOTE(review): the previous comment said "last 500" while the code kept 1000 rows;
# the code's value is preserved here — confirm which one was intended.
N_LAST_ROWS = 1000
raw_df = raw_df.tail(N_LAST_ROWS)
raw_df_bnb = raw_df_bnb.tail(N_LAST_ROWS)

In [6]:
# Trim price outliers: keep only rows whose sale price lies between the 1st and
# 99th percentile of its own dataset (both bounds inclusive).
usd_price = raw_df['ship_sold_price']
usd_low, usd_high = usd_price.quantile([0.01, 0.99])
raw_df = raw_df[usd_price.between(usd_low, usd_high)]

bnb_price = raw_df_bnb['ship_sold_price']
bnb_low, bnb_high = bnb_price.quantile([0.01, 0.99])
raw_df_bnb = raw_df_bnb[bnb_price.between(bnb_low, bnb_high)]

In [7]:
# Columns used for modelling: the six numeric ship attributes plus the sale price.
wo_cat_columns = [
    'ship_attr_space',
    'ship_attr_speed',
    'ship_attr_skill',
    'ship_attr_defence',
    'ship_attr_attack',
    'ship_attr_morale',
    'ship_sold_price',
]

# USD: features (X) are the attribute columns, target (y) is the sale price.
wo_cat_df = raw_df[wo_cat_columns]
wo_cat_X = wo_cat_df.drop(columns='ship_sold_price')
wo_cat_y = wo_cat_df['ship_sold_price']

# BNB: same split on the BNB-priced sales.
wo_cat_df_bnb = raw_df_bnb[wo_cat_columns]
wo_cat_X_bnb = wo_cat_df_bnb.drop(columns='ship_sold_price')
wo_cat_y_bnb = wo_cat_df_bnb['ship_sold_price']

In [8]:
# Training of the data
# Hold out 30% of each dataset for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    wo_cat_X, wo_cat_y, test_size=0.3, random_state=42
)
X_train_bnb, X_test_bnb, y_train_bnb, y_test_bnb = train_test_split(
    wo_cat_X_bnb, wo_cat_y_bnb, test_size=0.3, random_state=42
)

# One plain linear regression per currency, fitted on its own training split.
reg = LinearRegression().fit(X_train, y_train)

reg_bnb = LinearRegression()
# Left as the final expression so the cell still displays the fitted estimator.
reg_bnb.fit(X_train_bnb, y_train_bnb)

LinearRegression()

In [9]:
# Predict on TEST dataset
y_pred, y_pred_bnb = reg.predict(X_test), reg_bnb.predict(X_test_bnb)

# Cell output: score of the BNB model on its held-out split.
reg_bnb.score(X_test_bnb, y_test_bnb)

0.3323414740525271

In [10]:
# zztest = np.array([5,13,5,6,11,5])
# zztest = zztest.reshape(1, -1)

# lasso_usd.predict(zztest)

In [11]:
# Coefficients of the BNB linear model, which the training cell has already
# fitted on (X_train_bnb, y_train_bnb). Reading the attribute directly avoids
# re-fitting the estimator on the same data just to display them.
reg_bnb.coef_

array([0.00795914, 0.00891192, 0.00628072, 0.00613043, 0.00886067,
       0.00585652])

In [12]:
# LINEAR REGRESSION: 5-fold cross-validation of each model over its full
# (percentile-trimmed) feature/target set.
cv_scores = cross_val_score(estimator=reg, X=wo_cat_X, y=wo_cat_y, cv=5)
cv_scores_bnb = cross_val_score(estimator=reg_bnb, X=wo_cat_X_bnb, y=wo_cat_y_bnb, cv=5)

# Cell output: mean fold score for USD (BNB equivalent: cv_scores_bnb.mean()).
cv_scores.mean()

0.4392831691223599

In [13]:
# Root-mean-squared error of the USD linear model's predictions on the test split.
mse_usd = mean_squared_error(y_test, y_pred)
np.sqrt(mse_usd)

21.441694538318803

In [14]:
# # USING L2 REGULARIZATION - ridge

# ridge = Ridge(alpha=0.1, normalize=True)
# ridge_coef = ridge.fit(X_train, y_train).coef_
# ridge_pred = ridge.predict(X_test)
# ridge.score(X_test, y_test)

# cv_scores_ridge = cross_val_score(ridge, wo_cat_X, wo_cat_y, cv=5)
# np.mean(cv_scores_ridge)


In [15]:
# np.sqrt(mean_squared_error(y_test, ridge_pred))

In [16]:
# USING L1 regularization USD
lasso_usd = Lasso(alpha=0.6)
lasso_usd.fit(X_train, y_train)
lasso_usd_coef = lasso_usd.coef_
lasso_pred_usd = lasso_usd.predict(X_test)

# NOTE(review): this score is not the cell's last expression, so its value is
# neither displayed nor stored.
lasso_usd.score(X_test, y_test)

# Cell output: mean 5-fold cross-validation score over the full USD set.
cv_scores_lasso_usd = cross_val_score(lasso_usd, wo_cat_X, wo_cat_y, cv=5)
cv_scores_lasso_usd.mean()


0.43951535871356723

In [17]:
# Display the coefficients captured from the USD Lasso model right after it was
# fitted on (X_train, y_train).
lasso_usd_coef

array([2.20960586, 2.44328838, 1.90630118, 2.00842144, 2.81724383,
       1.79418332])

In [18]:
# USING L1 regularization BNB

# lasso_bnb = Lasso(alpha=1)
# lasso_bnb_coef = lasso_bnb.fit(X_train_bnb, y_train_bnb).coef_
# lasso_pred_bnb = lasso_bnb.predict(X_test_bnb)

# lasso_bnb.score(X_test_bnb, y_test_bnb)

# cv_scores_lasso_bnb = cross_val_score(lasso_bnb, wo_cat_X_bnb, wo_cat_y_bnb, cv=5)
# np.mean(cv_scores_lasso_bnb)

In [19]:
#np.sqrt(mean_squared_error(y_test, lasso_pred_usd))

In [20]:
# Check the coefficient for penalization

# names = wo_cat_df.drop('ship_sold_price', axis=1).columns
# _ = plt.plot(range(len(names)), ridge_coef)
# _ = plt.xticks(range(len(names)), names, rotation=60)
# _ = plt.ylabel('Coefficients')
# plt.show()

In [21]:
# SINGLE PREDICTION
# zztest = np.array([8,8,11,8,5,8])
# zztest = np.array([5,13,15,6,11,5])
# zztest = zztest.reshape(1, -1)

# lasso_usd.predict(zztest)
## actual 34

In [22]:
# Per-ship summary of the BNB sales: how many times each ship_id appears and its
# mean sale price, ordered by sale count (highest first).
bnb_sales_by_ship = (
    raw_df_bnb[['ship_id', 'ship_sold_price']]
    .groupby(['ship_id'], as_index=False)
    .agg(['count', 'mean'])
)
bnb_sales_by_ship.sort_values(('ship_sold_price', 'count'), ascending=False)

Unnamed: 0_level_0,ship_sold_price,ship_sold_price
Unnamed: 0_level_1,count,mean
ship_id,Unnamed: 1_level_2,Unnamed: 2_level_2
2433,4,0.090000
13546,4,0.087500
11715,3,0.266667
27993,3,0.093333
8991,3,0.123333
...,...,...
12493,1,0.150000
12511,1,0.150000
12512,1,0.190000
12629,1,0.320000


In [23]:
# times_sold = raw_df[['ship_id', 'ship_sold_price']].groupby('ship_id')['ship_sold_price'].apply(list).reset_index(name='ship_sold_price').agg({'ship_sold_price': lambda x: len(x)})

# new_df = raw_df[['ship_id', 'ship_sold_price']].groupby('ship_id')['ship_sold_price'].apply(list).reset_index(name='ship_sold_price')
# new_df['times_sold'] = times_sold
# new_df.sort_values('times_sold', ascending=False, inplace=True)

# multiple_sold = new_df[new_df['times_sold'] > 1]

# multiple_sold

In [24]:
# SAVE MODEL
# Persist the fitted estimators under the db/ directory: the Lasso model for USD
# prices and the plain linear regression for BNB prices.
# The files are opened with context managers so each handle is flushed and
# closed as soon as its dump finishes, instead of being left open until the
# file object happens to be garbage-collected.
with open(f"{relative_location}db/model_USD.sav", 'wb') as usd_model_file:
    pickle.dump(lasso_usd, usd_model_file)

with open(f"{relative_location}db/model_BNB.sav", 'wb') as bnb_model_file:
    pickle.dump(reg_bnb, bnb_model_file)

