### Importing and Loading the Data from pickle files

In [1]:
#General Imports
import os 
import sys
from dotenv import load_dotenv #importing env file
import pandas as pd
import random
import pickle



In [2]:
#ML Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [3]:
#NLP
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [4]:
# Locations of the pickled train/test data used by the DataFrame-based cells below.
# NOTE(review): these are absolute paths on one machine; the notebook will not run
# elsewhere until they are edited (a configurable data directory would avoid that).
train_path = '/Users/vishesh/projects/Pricer/data/balanced/train.pkl'
test_path = '/Users/vishesh/projects/Pricer/data/balanced/test.pkl'
# pickle.load can execute arbitrary code while loading, so only open files you trust.
with open(train_path,'rb') as f:
    train = pickle.load(f)
with open(test_path,'rb') as f:
    test = pickle.load(f)

In [5]:
print(train['title'].iloc[2344])

Sid Meier's Civilization Revolution - Playstation 3


In [6]:

print(train['prompt'].iloc[2344])
print(test['test_prompt'].iloc[2344])

How much does this cost to the nearest dollar?

Sid Meier's Civilization Revolution - Playstation 3
Product Description In Civilization, players strive to become ruler of the world by establishing and leading a civilization from the dawn of man into the space age - waging war, conducting diplomacy, discovering technologies, going head-to-head with some of historys greatest leaders, and building the most powerful empire the world has ever known. Sid Meier's Civilization Revolution is a watershed game, offering players a chance to experience the epic empire-building world of Civilization in an all new accessible, visually immersive, and action-packed world specifically designed for the console and handheld gamer. Delivering Civilizations renowned epic single-player campaigns featuring vast re-playability and unmatched addictive gameplay as well as revolutionary features like real-time interaction with leaders and advisors, extensive multiplayer capabilities and integrated video and voice

### TESTER CLASS

In [7]:
import math 

In [8]:
# ANSI escape codes used to colour each printed result line
GREEN = "\033[92m"
ORANGE = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red":RED, "orange": ORANGE, "green": GREEN}


class Tester:
    """Evaluate a pricing function on the first `size` rows of a DataFrame.

    For every row the predictor is called with the row, the guess is compared
    with the row's 'price', a colour-coded line is printed, and the per-row
    metrics are stored. `report()` then summarises them in a scatter chart.

    Parameters:
        predictor: callable taking one row and returning a number.
        title: chart title prefix; defaults to the predictor's name, title-cased.
        data: DataFrame with 'price' and 'title' columns. When omitted, the
            notebook-level `test` DataFrame is looked up at construction time.
        size: maximum number of rows to evaluate.
    """

    def __init__(self, predictor, title=None, data=None, size=250):
        self.predictor = predictor
        # Resolve the default when the object is created rather than when the class
        # is defined, so re-loading `test` does not leave this class on stale data.
        self.data = test if data is None else data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self._reset()

    def _reset(self):
        """Clear every per-datapoint accumulator."""
        self.guesses = []
        self.truths = []
        self.errors = []
        self.lche = []
        self.sles = []
        self.colors = []

    @staticmethod
    def _squared_log_error(guess, truth):
        """Return (log(1 + truth) - log(1 + guess)) ** 2 with both values floored at 0.

        Flooring keeps the logarithm defined when a model predicts a negative
        price; without it a guess of -1 or less raises a math domain error.
        """
        log_error = math.log1p(max(truth, 0.0)) - math.log1p(max(guess, 0.0))
        return log_error ** 2

    def run_datapoint(self, i):
        """Score row `i`, store its metrics and print one colour-coded summary line."""
        datapoint = self.data.iloc[i]

        guess = float(self.predictor(datapoint))  # predicted price
        truth = float(datapoint['price'])

        error = abs(truth - guess)
        sle = self._squared_log_error(guess, truth)
        log_cosh_error = self.safe_log_cosh(error)

        color = self.color_for(error, truth)
        # Keep the printed line short: show at most 40 characters of the title
        title = datapoint['title'] if len(datapoint['title']) <= 40 else datapoint['title'][:40] + '...'

        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.colors.append(color)
        self.lche.append(log_cosh_error)
        self.sles.append(sle)

        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def safe_log_cosh(self, x):
        """Return log(cosh(x)) without overflowing for large |x|.

        Uses the identity log(cosh(x)) = |x| + log(1 + exp(-2|x|)) - log(2),
        which stays exact for any magnitude instead of capping the input.
        """
        magnitude = abs(x)
        return magnitude + math.log1p(math.exp(-2.0 * magnitude)) - math.log(2.0)

    def color_for(self, error, truth):
        """Classify an error as 'green', 'orange' or 'red'.

        Green: error below 40 or below 20% of the truth. Orange: below 80 or
        below 40% of the truth. Red otherwise. When `truth` is not positive the
        relative test is skipped, which avoids a division by zero.
        """
        relative = error / truth if truth > 0 else float("inf")
        if error < 40 or relative < 0.2:
            return 'green'
        if error < 80 or relative < 0.4:
            return 'orange'
        return 'red'

    def chart(self, title):
        """Scatter the guesses against the true values, with the y = x line for reference."""
        if not self.truths:
            return  # nothing to plot
        plt.figure(figsize=(12,8))
        max_val = max(max(self.truths),max(self.guesses))
        plt.plot([0,max_val],[0,max_val],color='skyblue' , lw=2 , alpha=0.6)
        plt.scatter(self.truths,self.guesses,s=3,c=self.colors)
        plt.xlabel('True Values')
        plt.ylabel('Guess Values by Model')
        plt.xlim(0,max_val)
        plt.ylim(0,max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Chart the results under a title holding mean error, RMSLE and green hit rate."""
        # Average over the rows actually evaluated, which may be fewer than `size`
        count = len(self.errors)
        if count == 0:
            print(f"{self.title}: no datapoints were evaluated")
            return
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        HIT = sum(1 for color in self.colors if color =='green')
        title = f"{self.title} Error=${average_error:,.2f}  RMSLE={rmsle:,.2f}  HIT={HIT/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate up to `size` rows from a clean state, then report."""
        self._reset()  # makes repeated run() calls give the same result
        for i in range(min(self.size, len(self.data))):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function):
        """Build a tester for `function` with the default settings and run it."""
        cls(function).run()


### Basic testing

In [None]:
# Seed Python's `random` module so the random baseline is repeatable
random.seed(42)

# 2. Define any predictor function
def random_pricer(item):
    """Baseline predictor: ignore `item` and return a random integer from 1 to 999."""
    return random.randrange(1, 1000)

# 3. Test any function
Tester.test(random_pricer)



In [None]:
# Baseline: always guess the mean price of the training data
average = train['price'].mean()

def average_pricer(item):
    """Baseline predictor: ignore `item` and return the training mean price held in `average`."""
    return average
Tester.test(average_pricer)

In [11]:
train.keys()

Index(['title', 'price', 'category', 'test_prompt', 'prompt'], dtype='object')

In [12]:
'''
Training is missing the raw details so , 
I processed the data again this time including the raw details 
stored in data Raw
'''

'\nTraining is missing the raw details so , \nI processed the data again this time including the raw details \nstored in data Raw\n'

In [13]:
# Pickled train/test items that still carry the raw `details` string.
# NOTE(review): absolute, machine-specific paths; edit them before running elsewhere.
train_details_path = '/Users/vishesh/projects/Pricer/data/raw/train_details.pkl'
test_details_path = '/Users/vishesh/projects/Pricer/data/raw/test_details.pkl'
# pickle.load can execute arbitrary code while loading, so only open files you trust.
with open(train_details_path,'rb') as f:
    train_details = pickle.load(f)
with open(test_details_path , 'rb') as f :
    test_details = pickle.load(f)

In [14]:
test_details[234].details

'{"Brand": "Bostingner", "Color": "Matte Black", "Material": "Stainless Steel, Brass, Metal", "Finish Type": "\\u200eBrushed", "Number of Handles": "1", "Handle Material": "Brass", "Mounting Type": "Ceiling Mount", "Included Components": "1 \\u00d7 Shower Holder (Brass), 1 \\u00d7 User Manual, 1 \\u00d7 Pressure Balance Valve (Brass), 1 \\u00d7 60-inch Shower Hose (304 stainless steel), 1 \\u00d7 Shower Arm (Brass), 1 \\u00d7 Handheld Shower Head (Brass), 1 \\u00d7 10-inch Square Rain Shower Head (304 stainless steel)", "Flow Rate Description": "2.5 Gallons Per Minute", "Manufacturer": "Bostingner", "Part Number": "10 INCH SHOWER SYSTEM", "Item Weight": "11.92 pounds", "Product Dimensions": "17.32 x 13.46 x 4.72 inches", "Country of Origin": "China", "Item model number": "BST2202DH-C", "Size": "10 INCH SHOWER SYSTEM", "Style": "Ceiling Mount", "Finish": "Brushed", "Pattern": "10 Inch Matte Black Shower System Ceiling Mounted", "Shape": "Square", "Installation Method": "Ceiling-Mounted"

### Feature Engineering

In [15]:
#converting string to dictionary for extracting data/features easily using json 
#in a new features field populated with json from details dict
import json
# Parse each item's JSON `details` string into a dict kept on a new `features` attribute
for detail_items in (train_details, test_details):
    for detail in detail_items:
        detail.features = json.loads(detail.details)

train_details[0].features

{'Brand': 'Delphi',
 'Fit Type': 'Vehicle Specific Fit',
 'Item Dimensions LxWxH': '19.7 x 7.7 x 5.1 inches',
 'Item Weight': '2.2 Pounds',
 'Auto Part Position': 'Unknown',
 'Operation Mode': 'Mechanical',
 'Manufacturer': 'Delphi',
 'Model': 'FUEL PUMP',
 'Product Dimensions': '19.7 x 7.7 x 5.1 inches',
 'Country of Origin': 'USA',
 'Item model number': 'FG0166',
 'Is Discontinued By Manufacturer': 'No',
 'Exterior': 'Painted',
 'Manufacturer Part Number': 'FG0166',
 'OEM Part Number': '25326856, 89060639, A30416, E4061R, MU242',
 'Best Sellers Rank': {'Automotive': 913571,
  'Automotive Replacement Electric Fuel Pumps': 6568},
 'Domestic Shipping': 'Item can be shipped within U.S.',
 'International Shipping': 'This item is not eligible for international shipping.  Learn More',
 'Date First Available': 'August 9, 2006'}

In [16]:
#getting the most common features 
from collections import Counter
# Count how many training items carry each feature key
feature_count = Counter(
    feature_name
    for entry in train_details
    for feature_name in entry.features
)
feature_count.most_common(20)

[('Date First Available', 360062),
 ('Item Weight', 350444),
 ('Manufacturer', 349357),
 ('Brand', 340323),
 ('Best Sellers Rank', 330473),
 ('Item model number', 277573),
 ('Product Dimensions', 262400),
 ('Color', 183794),
 ('Is Discontinued By Manufacturer', 169828),
 ('Manufacturer Part Number', 145092),
 ('Material', 137447),
 ('Country of Origin', 121451),
 ('Style', 77639),
 ('Special Feature', 74709),
 ('Part Number', 74529),
 ('Batteries Required?', 73595),
 ('Package Dimensions', 72900),
 ('Item Dimensions LxWxH', 71385),
 ('Included Components', 67509),
 ('Model', 65144)]

In [17]:
# Item Weight, Manufacturer, Brand and Best Sellers Rank look like the most useful features.
# Examine each of them in turn to understand the data better.

In [18]:
# Count training items per brand, skipping items whose "Brand" is missing or empty
brands = Counter(
    brand_name
    for brand_name in (entry.features.get('Brand') for entry in train_details)
    if brand_name
)
brands.most_common(40)

[('HP', 5500),
 ('Power Stop', 3594),
 ('Dell', 3254),
 ('Detroit Axle', 2734),
 ('Lenovo', 2441),
 ('Dorman', 2116),
 ('SAMSUNG', 1772),
 ('BUYAUTOPARTS!', 1715),
 ('ACDelco', 1609),
 ('Evan Fischer', 1415),
 ('ASUS', 1387),
 ('Sony', 1282),
 ('Canon', 1201),
 ('Callahan BRAKE PARTS', 1154),
 ('Kohler', 1152),
 ('CURT', 1120),
 ('R1 Concepts', 1054),
 ('Rareelectrical', 966),
 ('Coverking', 941),
 ('Garage-Pro', 888),
 ('Kingston Brass', 852),
 ('Spectra Premium', 834),
 ('Moen', 829),
 ('Auto Dynasty', 824),
 ('WeatherTech', 822),
 ('DELTA FAUCET', 811),
 ('Generic', 793),
 ('Apple', 783),
 ('Cardone', 765),
 ('APS', 763),
 ('K&N', 758),
 ('GM', 743),
 ('Walker', 732),
 ('EBC Brakes', 717),
 ('AKKON', 646),
 ('SPEC-D TUNING', 626),
 ('TYC', 626),
 ('Covercraft', 618),
 ('Intel', 610),
 ('A-Premium', 607)]

In [19]:
# The most common brands are a mix of electronics and automotive names,
# so they are grouped into a few hand-picked lists (all lower-case).
TOP_ELECTRONICS_BRANDS = ['hp','dell','lenovo','samsung','asus','sony','canon','apple','intel']

PREMIUM_ELECTRONICS_BRANDS = ['apple', 'sony', 'canon', 'samsung']

TOP_AUTO_PARTS_BRANDS = ['power stop', 'detroit axle', 'dorman', 'buyautoparts!', 'acdelco', 
                         'evan fischer', 'callahan brake parts', 'r1 concepts', 'rareelectrical',
                         'garage-pro', 'spectra premium', 'auto dynasty', 'cardone', 'aps', 
                         'gm', 'walker', 'ebc brakes', 'akkon', 'spec-d tuning', 'tyc', 'a-premium']

TOP_AUTO_ACCESSORIES_BRANDS = ['curt', 'coverking', 'weathertech', 'covercraft', 'k&n']


def _brand_in(item, brand_names):
    """Return True when the item's "Brand" feature, lower-cased, is in `brand_names`.

    A missing, empty or non-string brand gives False, so the result is always a
    real bool rather than None or '' leaking out of an `and` expression.
    """
    brand = item.features.get("Brand")
    return isinstance(brand, str) and brand.lower() in brand_names


def is_top_electronics_brand(item):
    """True if the item's brand is listed in TOP_ELECTRONICS_BRANDS (case-insensitive)."""
    return _brand_in(item, TOP_ELECTRONICS_BRANDS)


def is_premium_electronics_brand(item):
    """True if the item's brand is listed in PREMIUM_ELECTRONICS_BRANDS (case-insensitive)."""
    return _brand_in(item, PREMIUM_ELECTRONICS_BRANDS)


def is_top_auto_parts_brand(item):
    """True if the item's brand is listed in TOP_AUTO_PARTS_BRANDS (case-insensitive)."""
    return _brand_in(item, TOP_AUTO_PARTS_BRANDS)


def is_top_auto_accessories_brand(item):
    """True if the item's brand is listed in TOP_AUTO_ACCESSORIES_BRANDS (case-insensitive)."""
    return _brand_in(item, TOP_AUTO_ACCESSORIES_BRANDS)


In [20]:
#focusing on weight of item now 


# The unit of weight varies between items, so convert every weight to grams
def get_weight(item):
    """Return the item's "Item Weight" feature converted to grams, or None.

    The feature is expected to look like "<amount> <unit>", for example
    "2.2 Pounds", or "<amount> Hundredths Pounds". Units are matched
    case-insensitively in singular or plural form: pounds, ounces, grams,
    milligrams, kilograms.

    None is returned when the feature is missing or empty. A value that is
    present but cannot be interpreted (no unit, non-numeric amount, unknown
    unit) is printed for inspection and also yields None instead of raising.
    """
    weight_str = item.features.get('Item Weight')
    if not weight_str:
        return None
    if not isinstance(weight_str, str):
        print(weight_str)
        return None

    # split() with no argument tolerates repeated or surrounding whitespace
    parts = weight_str.split()
    try:
        # Drop thousands separators such as "1,200" before parsing
        amount = float(parts[0].replace(',', ''))
    except (IndexError, ValueError):
        print(weight_str)
        return None

    # Lower-case the unit words and drop a plural "s" so "Pound" and "Pounds" both match
    words = [word.lower().rstrip('s') for word in parts[1:]]
    unit = words[0] if words else ''

    if unit == "pound":
        return amount * 453.592  # pounds to g
    if unit == "ounce":
        return amount * 28.3495  # ounces to g
    if unit == "gram":
        return amount  # already in g
    if unit == "milligram":
        return amount / 1000  # mg to g
    if unit == "kilogram":
        return amount * 1000  # kg to g
    if unit == "hundredth" and len(words) > 1 and words[1] == "pound":
        return (amount / 100) * 453.592  # hundredths of a pound to g

    # Unknown or missing unit: show the raw value so new formats are noticed
    print(weight_str)
    return None

In [21]:
# Parsed training weights in grams, keeping only truthy values (drops None and 0)
weights = [grams for grams in map(get_weight, train_details) if grams]

In [22]:
# Mean of the parsed training weights, printed with thousands separators
average_weight = sum(weights) / len(weights)
print(format(average_weight, ",.2f"))


def get_weight_with_default(item):
    """Return `get_weight(item)`, or `average_weight` when that result is falsy (None or 0)."""
    parsed = get_weight(item)
    if parsed:
        return parsed
    return average_weight

6,155.49


In [23]:
#focusing on rank 
# single product has multiple ranks(in different product categories)
# so taking average of all of them 
def get_rank(item):
    """Return the mean of the values in the item's "Best Sellers Rank" mapping.

    Returns None when the feature is missing or empty.
    """
    category_ranks = item.features.get("Best Sellers Rank")
    if not category_ranks:
        return None
    rank_values = category_ranks.values()
    return sum(rank_values) / len(rank_values)

# Mean rank per training item, skipping items with no usable rank (None or 0)
ranks = [mean_rank for mean_rank in map(get_rank, train_details) if mean_rank]
average_rank = sum(ranks) / len(ranks)
print(format(average_rank, ",.2f"))

380,638.26


In [24]:
def get_rank_with_default(item):
    """Return `get_rank(item)`, or `average_rank` when that result is falsy (None or 0)."""
    mean_rank = get_rank(item)
    if mean_rank:
        return mean_rank
    return average_rank

In [25]:
def get_text_length(item):
    """Return the number of characters in the text produced by calling `item.test_prompt()`."""
    prompt_text = item.test_prompt()
    return len(prompt_text)

In [26]:
def get_features(item):
    """Build the feature dict for one item.

    Keys, in order: 'weight', 'rank', the four brand flags (each 0 or 1) and
    'text_length'.
    """
    brand_flags = (
        ('is_top_auto_parts_brand', is_top_auto_parts_brand),
        ('is_premium_electronics_brand', is_premium_electronics_brand),
        ('is_top_electronics_brand', is_top_electronics_brand),
        ('is_top_auto_accessories_brand', is_top_auto_accessories_brand),
    )
    features = {
        'weight': get_weight_with_default(item),
        'rank': get_rank_with_default(item),
    }
    for column, predicate in brand_flags:
        features[column] = 1 if predicate(item) else 0
    features['text_length'] = get_text_length(item)
    return features

In [27]:
get_features(train_details[35])

{'weight': 14605.662400000001,
 'rank': 964.6666666666666,
 'is_top_auto_parts_brand': 0,
 'is_premium_electronics_brand': 1,
 'is_top_electronics_brand': 1,
 'is_top_auto_accessories_brand': 0,
 'text_length': 738}

In [28]:
def list_to_df(items):
    """Return a DataFrame with one row of `get_features` output per item plus a 'price' column."""
    feature_rows = list(map(get_features, items))
    frame = pd.DataFrame(feature_rows)
    frame['price'] = [entry.price for entry in items]
    return frame

# Feature/price tables for the train and test item lists
train_details_df = list_to_df(train_details)
test_details_df = list_to_df(test_details)


### Baseline Models

#### A second tester class is needed here, because these models are evaluated on the list of item objects (`test_details`) rather than on the `test` DataFrame

In [29]:
class Tester_baseline:
    """Evaluate a pricing function on the first `size` entries of a list of items.

    Each item is read through its `price` and `title` attributes. For every
    item the predictor's guess is compared with the price, a colour-coded line
    is printed, and the metrics are stored for `report()`.

    Parameters:
        predictor: callable taking one item and returning a number.
        title: chart title prefix; defaults to the predictor's name, title-cased.
        data: indexable sequence of items. When omitted, the notebook-level
            `test_details` list is looked up at construction time.
        size: maximum number of items to evaluate.
    """

    def __init__(self, predictor, title=None, data=None, size=250):
        self.predictor = predictor
        # Resolve the default when the object is created rather than when the class
        # is defined, so re-loading `test_details` does not leave stale data here.
        self.data = test_details if data is None else data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self._reset()

    def _reset(self):
        """Clear every per-datapoint accumulator."""
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    @staticmethod
    def _squared_log_error(guess, truth):
        """Return (log(1 + truth) - log(1 + guess)) ** 2 with both values floored at 0.

        Flooring keeps the logarithm defined when a model (for example an
        unclamped linear regression) predicts a price of -1 or less.
        """
        log_error = math.log1p(max(truth, 0.0)) - math.log1p(max(guess, 0.0))
        return log_error ** 2

    def color_for(self, error, truth):
        """Classify an error as "green", "orange" or "red".

        Green: error below 40 or below 20% of the truth. Orange: below 80 or
        below 40% of the truth. Red otherwise. When `truth` is not positive the
        relative test is skipped, which avoids a division by zero.
        """
        relative = error / truth if truth > 0 else float("inf")
        if error < 40 or relative < 0.2:
            return "green"
        if error < 80 or relative < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Score item `i`, store its metrics and print one colour-coded summary line."""
        datapoint = self.data[i]
        # float() turns NumPy scalars returned by fitted models into plain numbers
        guess = float(self.predictor(datapoint))
        truth = float(datapoint.price)
        error = abs(guess - truth)
        sle = self._squared_log_error(guess, truth)
        color = self.color_for(error, truth)
        # Keep the printed line short: show at most 40 characters of the title
        title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40]+"..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter the guesses against the true values, with the y = x line for reference."""
        if not self.truths:
            return  # nothing to plot
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Chart the results under a title holding mean error, RMSLE and green hit rate."""
        # Average over the items actually evaluated, which may be fewer than `size`
        count = len(self.errors)
        if count == 0:
            print(f"{self.title}: no datapoints were evaluated")
            return
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate up to `size` items from a clean state, then report."""
        self._reset()  # makes repeated run() calls give the same result
        for i in range(min(self.size, len(self.data))):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function):
        """Build a tester for `function` with the default settings and run it."""
        cls(function).run()

#### Linear Regression

In [30]:
np.random.seed(42)

train_details_df.keys()

Index(['weight', 'rank', 'is_top_auto_parts_brand',
       'is_premium_electronics_brand', 'is_top_electronics_brand',
       'is_top_auto_accessories_brand', 'text_length', 'price'],
      dtype='object')

In [31]:
# Columns used as model inputs: every column of the feature tables except 'price'
feature_columns = ['weight', 'rank', 'is_top_auto_parts_brand',
       'is_premium_electronics_brand', 'is_top_electronics_brand',
       'is_top_auto_accessories_brand', 'text_length']

# Split the feature tables into inputs and the price target
X_train = train_details_df[feature_columns]
Y_train = train_details_df['price']
x_test =test_details_df[feature_columns]
y_test =test_details_df['price']

# Fit a linear regression on the hand-built features
model= LinearRegression()
model.fit(X_train , Y_train)

# Show the learned weight of each feature and the intercept
for feature , coef in zip(feature_columns , model.coef_):
    print(f"{feature}:{coef}")
print(f"Intercept :{model.intercept_}")

# Score the model on the held-out test features
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test , y_pred)
r2=r2_score(y_test , y_pred)

# RMSE is the square root of the mean squared error, in the same unit as price
print(f"RMSE :{(mse)**0.5}")
print(f"R-squared Score :{r2}")

weight:0.0005819372506698307
rank:5.1911328817418245e-05
is_top_auto_parts_brand:-0.7957915360511377
is_premium_electronics_brand:-62.370519689392786
is_top_electronics_brand:215.64467316897324
is_top_auto_accessories_brand:16.102562364956402
text_length:0.011850189448517182
Intercept :178.83197297550592
RMSE :176.7695015203689
R-squared Score :0.08653086259074871


In [32]:
def linear_regression_pricer(item):
    """Predict a price for one item with the fitted linear regression `model`.

    The item's features are placed in a one-row DataFrame so the column names
    match those used for fitting. The prediction is returned as a plain float
    and floored at 0: a linear model can extrapolate below zero, the other
    pricers in this notebook already clamp, and a guess of -1 or less breaks
    the log-based error computed by the tester.
    """
    features_df = pd.DataFrame([get_features(item)])
    predicted = float(model.predict(features_df)[0])
    return max(predicted, 0.0)

In [None]:
Tester_baseline.test(linear_regression_pricer)

#### BoW

In [34]:
# 1. Data Preparation
documents = train['test_prompt'].to_list()
# Use the test prompt as the document text so the model does not see the price
# NOTE(review): nothing below appears to draw random numbers, so this seed is
# probably unnecessary here; confirm before relying on it.
np.random.seed(42)
prices = train['price'].astype(float).to_numpy()
# Prices are kept as a float NumPy array; this same array is the regression
# target for every text model fitted later in the notebook.

# 2. Processing and Training
# Bag-of-words counts over at most 1000 terms, ignoring English stop words
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(documents)
regressor = LinearRegression()
regressor.fit(X, prices)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [35]:
def bow_lr_pricer(item):
    """Predict a price from the item's `test_prompt` with the bag-of-words linear model."""
    bag_of_words = vectorizer.transform([item.test_prompt])
    # predict() returns a NumPy array; take its single element
    predicted_price = regressor.predict(bag_of_words)[0]
    # Never return a negative price
    return max(predicted_price, 0)

In [None]:
Tester.test(bow_lr_pricer)

#### Word2Vec

In [None]:
# NOTE(review): Word2Vec takes its own `seed` argument, and gensim documents that
# fully reproducible training also needs workers=1; this NumPy seed probably does
# not make the embeddings repeatable. Confirm against the gensim documentation.
np.random.seed(42)

# Tokenise every training document into a list of lower-cased words
processed_docs = [simple_preprocess(doc) for doc in documents]

# Train 400-dimensional word vectors on the tokenised documents.
# The name `w2v_mode` (not `w2v_model`) is what later cells reference.
w2v_mode = Word2Vec(sentences=processed_docs, 
                        vector_size=400 , 
                        window=5,
                        min_count=2, 
                        workers=8,
                        epochs = 2 ,
                    )

                

In [40]:
def document_vector(doc):
    """Represent a document as the mean of its known word vectors.

    Words absent from the Word2Vec vocabulary are skipped; if none remain, a
    zero vector of the model's vector size is returned. Averaging is a simple
    first attempt rather than the best possible document embedding.
    """
    known_vectors = []
    for token in simple_preprocess(doc):
        if token in w2v_mode.wv:
            known_vectors.append(w2v_mode.wv[token])
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(w2v_mode.vector_size)

# One averaged vector per training document, stacked in the same order as `documents`
X_w2v = np.array([document_vector(doc) for doc in documents])


In [41]:
# Linear regression from the averaged Word2Vec document vectors to price
w2v_lr_regressor = LinearRegression()
w2v_lr_regressor.fit(X_w2v,prices)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [44]:
def w2v_lr_pricer(item):
    """Predict a non-negative price from the item's `test_prompt` with the Word2Vec linear model."""
    embedding = document_vector(item.test_prompt)
    prediction = w2v_lr_regressor.predict([embedding])[0]
    return max(0, prediction)

In [None]:
Tester.test(w2v_lr_pricer)

#### SVM

In [47]:
from sklearn.svm import LinearSVR

In [48]:
# LinearSVR is created without a random_state, so this global NumPy seed is
# presumably what makes the fit repeatable.
# NOTE(review): confirm; passing random_state explicitly would be clearer.
np.random.seed(42)
# Linear support-vector regression on the averaged Word2Vec vectors, default settings
svr_regressor = LinearSVR()
svr_regressor.fit(X_w2v, prices)

0,1,2
,epsilon,0.0
,tol,0.0001
,C,1.0
,loss,'epsilon_insensitive'
,fit_intercept,True
,intercept_scaling,1.0
,dual,'auto'
,verbose,0
,random_state,
,max_iter,1000


In [50]:
def svr_pricer(item):
    """Predict a non-negative price from the item's `test_prompt` with the fitted LinearSVR.

    Prediction with an already-fitted linear model does not use random
    numbers, so the global NumPy seed is left alone here; reseeding on every
    call would silently reset the random state of any other code in the notebook.
    """
    doc_vector = document_vector(item.test_prompt)
    return max(0, float(svr_regressor.predict([doc_vector])[0]))

In [None]:
Tester.test(svr_pricer)

#### Random Forest

In [52]:
from sklearn.ensemble import RandomForestRegressor

In [54]:
# Random forest of 100 trees on the averaged Word2Vec vectors; the seed is fixed
# through random_state and training uses 8 parallel jobs
rf_model = RandomForestRegressor(n_estimators=100, random_state=42 , n_jobs=8)
rf_model.fit(X_w2v , prices)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [55]:
def random_forest_pricer(item):
    """Predict a non-negative price from the item's `test_prompt` with the random forest."""
    embedding = document_vector(item.test_prompt)
    prediction = rf_model.predict([embedding])[0]
    return max(0, prediction)

In [None]:
Tester.test(random_forest_pricer)