In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
from math import sqrt

In [15]:
listing_df = pd.read_pickle('etsy-painting-listings-cleaned.pkl')

In [16]:
#categorize when_made into cutomized buckets
listing_df.loc[listing_df['when_made']==False,'when_made']='Unkown'
listing_df.loc[listing_df['when_made']=='before_2002','when_made']='2000_2009'
listing_df.loc[listing_df['when_made']=='1980s','when_made']='1960_1999'
listing_df.loc[listing_df['when_made']=='2002_2009','when_made']='2000_2009'
listing_df.loc[listing_df['when_made']=='1970s','when_made']='1960_1999'
listing_df.loc[listing_df['when_made']=='1990s','when_made']='1960_1999'
listing_df.loc[listing_df['when_made']=='1900s','when_made']='1900_1959'
listing_df.loc[listing_df['when_made']=='1960s','when_made']='1960_1999'
listing_df.loc[listing_df['when_made']=='1950s','when_made']='1900_1959'
listing_df.loc[listing_df['when_made']=='1910s','when_made']='1900_1959'
listing_df.loc[listing_df['when_made']=='1920s','when_made']='1900_1959'
listing_df.loc[listing_df['when_made']=='1930s','when_made']='1900_1959'
listing_df.loc[listing_df['when_made']=='1940s','when_made']='1900_1959'
listing_df.loc[listing_df['when_made']=='2000_2001','when_made']='2000_2009'
listing_df.loc[listing_df['when_made']=='1700s','when_made']='before_1800'
listing_df.loc[listing_df['when_made']=='before_1700','when_made']='before_1800'
listing_df

Unnamed: 0_level_0,price,materials,who_made,when_made,item_length,item_width,item_height,is_customizable,is_digital,has_variations,is_vintage
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
915817036,200.00,[Stretched canvas],i_did,2020_2021,,,,False,False,False,False
1076723839,55.00,[Stretched canvas],i_did,2010_2019,,,,False,False,False,False
985085465,7.98,[Canvas board],i_did,2020_2021,330.0,2.0,218.0,False,False,True,False
957968232,7.98,[Canvas board],i_did,2020_2021,330.0,2.0,218.0,False,False,True,False
957936450,7.98,[Canvas board],i_did,2020_2021,330.0,2.0,218.0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
110594622,558.00,[acrylic on cotton canvas],i_did,2010_2019,,,,True,False,False,False
124641650,553.00,[oil and acrylic on cotton canvas],i_did,2010_2019,,,,True,False,False,False
735178364,549.00,[oil and acrylic on linen canvas],i_did,2010_2019,,,,True,False,False,False
220667454,585.00,[oil and acrylic on linen canvas],i_did,2010_2019,,,,True,False,False,False


In [17]:
#listing_dim_df drops all na values in product dimension
listing_dim_df = listing_df.dropna()

In [18]:
#deal with categorical and binary variables
#remove listing id as row 
listing_dim_df = listing_dim_df.reset_index(drop=True)
#one hot encoding who made
enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(listing_dim_df[['who_made']]).toarray(),
                      columns=['collective','i_did','someone_else'])
enc.get_feature_names() #to see how features are ordered
listing_dim_df = listing_dim_df.join(enc_df)
listing_dim_df.drop(['who_made'],axis=1,inplace=True)
#one hot encoding when made
enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(listing_dim_df[['when_made']]).toarray(),
                      columns=enc.get_feature_names())
listing_dim_df = listing_dim_df.join(enc_df)
listing_dim_df.drop(['when_made'],axis=1,inplace=True)
#turn binary variables into 0,1
listing_dim_df['is_customizable'] = listing_dim_df['is_customizable'].astype('int')
listing_dim_df['is_digital'] = listing_dim_df['is_digital'].astype('int')
listing_dim_df['has_variations'] = listing_dim_df['has_variations'].astype('int')
listing_dim_df['is_vintage'] = listing_dim_df['is_vintage'].astype('int')

In [19]:
#bag of words model to analyze materials
# listing_dim_df['materials'] = listing_dim_df['materials'].apply(lambda x: ','.join([word.strip() for word in x]))
# vectorizer = CountVectorizer(token_pattern='(?u)[a-zA-Z][a-z ]+')
# vec_df = pd.DataFrame(vectorizer.fit_transform(listing_dim_df['materials'].values).toarray(),
#                       columns = vectorizer.get_feature_names())
# listing_dim_df = listing_dim_df.join(vec_df)
listing_dim_df.drop(['materials'],axis=1,inplace=True)
listing_dim_df

Unnamed: 0,price,item_length,item_width,item_height,is_customizable,is_digital,has_variations,is_vintage,collective,i_did,someone_else,x0_1800s,x0_1900_1959,x0_1960_1999,x0_2000_2009,x0_2010_2019,x0_2020_2021,x0_Unkown,x0_before_1800,x0_made_to_order
0,7.98,330.0,2.0,218.00,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,7.98,330.0,2.0,218.00,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,7.98,330.0,2.0,218.00,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,110.00,1016.0,101.6,762.00,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,16.99,254.0,203.2,50.80,0,0,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2891,5.00,228.6,228.6,127.00,1,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2892,50.00,635.0,508.0,101.60,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2893,50.00,279.4,228.6,19.05,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2894,90.00,355.6,304.8,25.40,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
#ridge regression
X = listing_dim_df.drop(['price'],axis=1)
y = listing_dim_df['price']
alphas = 10**np.linspace(10,-2,num=100)*0.5
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
ridge = RidgeCV(alphas=alphas,normalize=True,scoring='neg_mean_squared_error')
ridge.fit(X_train, y_train)

ridge.coef_
ridge.best_score_

-209115.3342735858

In [21]:
#lasso
alphas = np.logspace(-4,2,50)
lassocv = LassoCV(alphas=alphas,normalize=True,max_iter=10000)
lassocv.fit(X_train, y_train)
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)

pred_train_lasso= lasso.predict(X_train)
print(mean_squared_error(y_train,pred_train_lasso))
print(r2_score(y_train, pred_train_lasso))
pred_test_lasso= lasso.predict(X_test)
print(r2_score(y_test, pred_test_lasso))


200852.77161948237
0.06671587837021686
-0.017040873180517613


In [9]:
print(list(zip(lasso.coef_[lasso.coef_!=0.0], X_train.columns)))

[(-0.13704536807858977, 'item_length'), (0.3060352639991978, 'item_width'), (0.7685059148549654, 'item_height'), (-45.83933631477661, 'is_customizable'), (238.76162370112385, 'is_digital'), (-18.061873505717713, 'has_variations'), (-16.342590764785168, 'is_vintage'), (65.02469173352274, 'collective'), (138.84109318017659, 'i_did'), (-2.612186251909193, 'someone_else'), (642.9087099596192, 'x0_1800s'), (58.599749963152085, 'x0_1900_1959'), (4.328517421104929, 'x0_1960_1999')]


In [10]:
X_train.columns

Index(['item_length', 'item_width', 'item_height', 'is_customizable',
       'is_digital', 'has_variations', 'is_vintage', 'collective', 'i_did',
       'someone_else',
       ...
       'x ', 'x watercolour paints', 'xuan paper', 'yarn',
       'year archival hdr pigment inks', 'yellow',
       'yellow gold green blue pouring art', 'yellow ochre', 'yupo paper',
       'zinc canning lid'],
      dtype='object', length=1477)

In [23]:
#random forest regression
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)
rf_tr_pred = random_forest.predict(X_train)
rf_test_pred = random_forest.predict(X_test)

In [24]:
print('training mse ',mean_squared_error(y_train, rf_tr_pred))
print('testing mse ',mean_squared_error(y_test, rf_test_pred))
print('training r squared ', r2_score(y_train, rf_tr_pred))
print('testing r squared ', r2_score(y_test, rf_test_pred))

training mse  37926.59987314116
testing mse  173414.66399471764
training r squared  0.8237699526692744
testing r squared  -0.36148996737828076
