In [66]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sn
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
import cv2

# Load the training set; "train.csv" is resolved relative to the notebook's working directory.
df = pd.read_csv("train.csv")
# Preview the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,Product_id,Stall_no,instock_date,Market_Category,Customer_name,Loyalty_customer,Product_Category,Grade,Demand,Discount_avail,charges_1,charges_2 (%),Minimum_price,Maximum_price,Selling_Price
0,BRAE2NF6JA5GUEXG,37.0,2015-08-22 18:36:12.000,2,Lillyann,Yes,Fashion,1,68,0.0,376.0,11.0,2983.0,4713.0,4185.9477
1,TUNE8SFB6RJN2HSD,38.0,2016-03-27 21:19:13.000,24,Klynn,Yes,Fashion,0,51,0.0,397.0,12.0,7495.0,10352.0,9271.490256
2,BRAEAR7WZPQGPBZU,9.0,2015-08-18 19:25:22.000,447,Ridge,Yes,Child_care,0,10,0.0,250.0,9.0,5752.0,7309.0,6785.701362
3,WATDZ2ZQ8JPDHCTJ,50.0,2016-03-28 21:53:01.000,23,Abran,Yes,Educational,2,48,0.0,144.0,13.0,5090.0,20814.0,13028.917824
4,JWSEBUKYQPMBZ3RK,7.0,2016-03-29 22:58:53.000,63,Dustyn,Yes,Repair,1,35,1.0,211.0,4.0,2430.0,9261.0,906.553935


In [67]:
from datetime import datetime


def to_integer(d):
    """Collapse a ``YYYY-MM-DD HH:MM:SS[.fff]`` string into a single integer.

    Each field is multiplied by a fixed number of seconds and the products
    are summed.  Years and months use average lengths (365.2425 and 30.42
    days), so the result is an approximate ordinal that grows with time,
    not a true epoch timestamp.  Any fractional-second suffix is discarded.

    Parameters
    ----------
    d : str
        Date and time separated by a single space.

    Returns
    -------
    int
        Weighted sum of year, month, day, hour, minute and whole seconds.
    """
    fields = d.split(' ')
    ymd = fields[0].split('-')
    hms = fields[1].split(':')

    # Seconds per year, month, day, hour, minute and second respectively.
    seconds_per_unit = (31556952, 2628288, 86400, 3600, 60, 1)
    values = (
        int(ymd[0]),
        int(ymd[1]),
        int(ymd[2]),
        int(hms[0]),
        int(hms[1]),
        int(hms[2].split('.')[0]),  # keep whole seconds only
    )
    return sum(value * weight for value, weight in zip(values, seconds_per_unit))


def clean_data(df):
    """Prepare the raw training frame for model fitting.

    Steps, in order:

    1. Drop the identifier columns ``Product_id`` and ``Customer_name``.
    2. Drop every row that contains a missing value.
    3. Convert ``instock_date`` strings to integers with ``to_integer``.
    4. One-hot encode ``Loyalty_customer`` and ``Product_Category``; the
       dummy columns are appended on the right, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame.  Must contain ``instock_date``, ``Loyalty_customer`` and
        ``Product_Category``; the identifier columns are optional.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.  The input frame is not modified.
    """
    # errors="ignore" drops whichever identifier columns are present.  The
    # previous bare ``except: pass`` hid every error and, when only one of the
    # two columns was missing, silently left the other one in the frame.
    df = df.drop(columns=["Product_id", "Customer_name"], errors="ignore")

    # Drop rows (not columns) that still contain a missing value.
    df = df.dropna()

    # Convert the datetime strings to integers.
    df["instock_date"] = df["instock_date"].apply(to_integer)

    # One-hot encode the categorical columns, replacing each source column
    # with its dummy columns appended at the end of the frame.
    for column in ("Loyalty_customer", "Product_Category"):
        dummies = pd.get_dummies(df[column])
        df = pd.concat([df.drop(columns=[column]), dummies], axis="columns")

    return df


# Rebinds `df` to the cleaned frame. Re-running this cell without reloading
# train.csv fails: `instock_date` already holds integers by then, and
# to_integer calls `.split` on each value.
df = clean_data(df)
df.head()

Unnamed: 0,Stall_no,instock_date,Market_Category,Grade,Demand,Discount_avail,charges_1,charges_2 (%),Minimum_price,Maximum_price,...,Child_care,Cosmetics,Educational,Fashion,Home_decor,Hospitality,Organic,Pet_care,Repair,Technology
0,37.0,63610252356,2,1,68,0.0,376.0,11.0,2983.0,4713.0,...,0,0,0,1,0,0,0,0,0,0
1,38.0,63629109649,24,0,51,0.0,397.0,12.0,7495.0,10352.0,...,0,0,0,1,0,0,0,0,0,0
2,9.0,63609909706,447,0,10,0.0,250.0,9.0,5752.0,7309.0,...,1,0,0,0,0,0,0,0,0,0
3,50.0,63629198077,23,2,48,0.0,144.0,13.0,5090.0,20814.0,...,0,0,1,0,0,0,0,0,0,0
4,7.0,63629288429,63,1,35,1.0,211.0,4.0,2430.0,9261.0,...,0,0,0,0,0,0,0,0,1,0


In [68]:
# Feature matrix: every cleaned column except the regression target.
X = df.drop(["Selling_Price"], axis="columns")
# Target vector: the selling price the model is fitted to.
y = df["Selling_Price"]

In [69]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and every score computed from it, is identical on each fresh run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
# Fit ordinary least squares on the training split; fit() returns the estimator itself.
model = linear_model.LinearRegression().fit(x_train, y_train)
# Coefficient of determination (R^2) on the held-out split.
model.score(x_test, y_test)

0.905939771899413

In [71]:
# Load the rows to predict; this file has no Selling_Price column.
test = pd.read_csv("test.csv")
# Preview the first rows to check that the columns parsed as expected.
test.head()

Unnamed: 0,Product_id,Stall_no,instock_date,Market_Category,Customer_name,Loyalty_customer,Product_Category,Grade,Demand,Discount_avail,charges_1,charges_2 (%),Minimum_price,Maximum_price
0,SCHE4YSTDVPVZVXW,39.0,2016-01-13 07:45:08.000,205,Ivanka,No,Fashion,0,90,0,380.0,16.0,2576.0,3340
1,ACCEGCATKHNRXUHW,49.0,2015-08-23 20:37:05.000,3,Isaak,Yes,Fashion,0,87,0,393.0,16.0,1202.0,1955
2,NKCE6GJ5XVJDXNNZ,1.0,2015-11-14 18:12:39.000,183,Analiese,No,Technology,2,55,0,493.0,9.0,7175.0,15715
3,NKCEB8BK3ZXDHDHM,8.0,2015-11-21 04:56:19.000,358,Rusty,Yes,Child_care,3,86,0,303.0,16.0,5404.0,13078
4,TOPEFDXSAHRNPF94,33.0,2015-05-06 15:09:46.000,167,Eloise,No,Technology,0,27,0,567.0,16.0,4069.0,6244


In [72]:
def clean_test_data(df, stall_no_fill=12, expected_columns=None):
    """Prepare the raw test frame for prediction without dropping any row.

    Unlike ``clean_data``, rows with missing values are kept (every test row
    needs a prediction) and selected numeric columns are imputed instead.

    Steps, in order:

    1. Drop the identifier columns ``Product_id`` and ``Customer_name``
       (whichever are present).
    2. Convert ``instock_date`` strings to integers with ``to_integer``.
    3. One-hot encode ``Loyalty_customer`` and ``Product_Category``; the
       dummy columns are appended on the right, in that order.
    4. Fill missing ``Stall_no`` with ``stall_no_fill`` and missing
       ``charges_1``, ``charges_2 (%)`` and ``Minimum_price`` with the mean
       of that column in this frame.
    5. Optionally align the columns to ``expected_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test frame.
    stall_no_fill : scalar, default 12
        Replacement for missing ``Stall_no`` values.
    expected_columns : sequence of str, optional
        Feature columns the model was fitted on (for example ``X.columns``).
        When given, the result is reindexed to exactly these columns in this
        order: a column absent from this frame (such as a category that never
        occurs in the test data) is added filled with 0, and a column not in
        the list is dropped.  When omitted, the columns are returned as
        produced by the steps above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.  The input frame is not modified.
    """
    df = df.drop(columns=["Product_id", "Customer_name"], errors="ignore")
    df["instock_date"] = df["instock_date"].apply(to_integer)

    # One-hot encode the categorical columns, replacing each source column
    # with its dummy columns appended at the end of the frame.
    for column in ("Loyalty_customer", "Product_Category"):
        dummies = pd.get_dummies(df[column])
        df = pd.concat([df.drop(columns=[column]), dummies], axis="columns")

    df["Stall_no"] = df["Stall_no"].fillna(stall_no_fill)

    # Mean-impute the remaining numeric columns that are handled here.
    for column in ("charges_1", "charges_2 (%)", "Minimum_price"):
        df[column] = df[column].fillna(df[column].mean())

    if expected_columns is not None:
        df = df.reindex(columns=list(expected_columns), fill_value=0)

    return df

In [73]:
# Keep the product ids before cleaning: clean_test_data drops that column,
# and the submission needs an id next to each prediction.
result = pd.DataFrame()
result["Product_id"] = test["Product_id"]
# Rebinds `test` to the cleaned frame, so this cell cannot be re-run
# without reloading test.csv first.
test = clean_test_data(test)
test.head()

Unnamed: 0,Stall_no,instock_date,Market_Category,Grade,Demand,Discount_avail,charges_1,charges_2 (%),Minimum_price,Maximum_price,...,Child_care,Cosmetics,Educational,Fashion,Home_decor,Hospitality,Organic,Pet_care,Repair,Technology
0,39.0,63622594628,205,0,90,0,380.0,16.0,2576.0,3340,...,0,0,0,1,0,0,0,0,0,0
1,49.0,63610346009,3,0,87,0,393.0,16.0,1202.0,1955,...,0,0,0,1,0,0,0,0,0,0
2,1.0,63617444607,183,2,55,0,493.0,9.0,7175.0,15715,...,0,0,0,0,0,0,0,0,0,1
3,8.0,63618001627,358,3,86,0,303.0,16.0,5404.0,13078,...,1,0,0,0,0,0,0,0,0,0
4,33.0,63600972706,167,0,27,0,567.0,16.0,4069.0,6244,...,0,0,0,0,0,0,0,0,0,1


In [74]:
# Count the missing values left in each feature column before predicting.
test.isna().sum()

Stall_no           0
instock_date       0
Market_Category    0
Grade              0
Demand             0
Discount_avail     0
charges_1          0
charges_2 (%)      0
Minimum_price      0
Maximum_price      0
No                 0
Yes                0
Child_care         0
Cosmetics          0
Educational        0
Fashion            0
Home_decor         0
Hospitality        0
Organic            0
Pet_care           0
Repair             0
Technology         0
dtype: int64

In [82]:
# One prediction per cleaned test row. The array is assigned positionally,
# so `result` and `test` must have the same number of rows in the same order.
result["Selling_Price"] = model.predict(test)
def fun(x):
    """Floor a predicted price at zero and round it to six decimal places.

    Parameters
    ----------
    x : number
        Raw prediction.

    Returns
    -------
    number
        0 when ``x`` is negative, otherwise ``x`` rounded to 6 decimals.
    """
    floored = 0 if x < 0 else x
    return round(floored, 6)
# Clamp and round every prediction, then write the submission without the index column.
result["Selling_Price"] = result["Selling_Price"].apply(fun)
result.to_csv("submission.csv", index=False)