 # **Import Library**

In [2]:
import pandas as pd                         # Import Pandas
import statsmodels.api as sm                # Load statmodels
data = pd.read_csv("cleanDiamods.csv")      # Import dataset 

# **Processing Model**

In [3]:
y = data["price"]
x = data[["carat","x","y","z","table","depth","cut","color","clarity"]]
x = sm.add_constant(x)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 101)

mod2 = sm.OLS(y_train, X_train)                          # Create Modal
results2 = mod2.fit() 

results2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.879
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,10770.0
Date:,"Wed, 17 Nov 2021",Prob (F-statistic):,0.0
Time:,20:50:59,Log-Likelihood:,-79262.0
No. Observations:,13351,AIC:,158500.0
Df Residuals:,13341,BIC:,158600.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2215.3961,575.603,-3.849,0.000,-3343.659,-1087.133
carat,3877.2509,103.613,37.420,0.000,3674.154,4080.348
x,1721.7128,65.168,26.420,0.000,1593.975,1849.451
y,-1136.8569,64.945,-17.505,0.000,-1264.158,-1009.556
z,-1190.1302,199.100,-5.978,0.000,-1580.395,-799.865
table,-1.7343,0.531,-3.268,0.001,-2.775,-0.694
depth,47.4263,9.178,5.167,0.000,29.436,65.416
cut,-16.5138,0.998,-16.555,0.000,-18.469,-14.558
color,-51.5837,0.506,-101.985,0.000,-52.575,-50.592

0,1,2,3
Omnibus:,350.292,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,755.62
Skew:,0.14,Prob(JB):,8.3e-165
Kurtosis:,4.131,Cond. No.,65200.0


 # **Add Predicted to Table**

In [5]:
print(X_test)

       const  carat     x     y     z  table  depth  cut  color  clarity
13441    1.0   0.40  4.80  4.76  2.89   59.0   60.5    1      0        5
5416     1.0   0.31  4.35  4.31  2.67   59.0   61.7    1      0        6
7331     1.0   0.36  4.63  4.58  2.79   56.0   60.6    0      0        6
9141     1.0   0.30  4.28  4.30  2.68   56.0   62.5    0      1        2
8798     1.0   0.42  4.79  4.82  2.96   58.0   61.6    1      4        4
...      ...    ...   ...   ...   ...    ...    ...  ...    ...      ...
3894     1.0   0.30  4.29  4.32  2.68   58.0   62.3    1      4        2
14735    1.0   0.33  4.40  4.43  2.74   58.0   62.1    1      0        7
11421    1.0   0.32  4.41  4.38  2.68   59.0   61.0    1      3        1
3643     1.0   0.31  4.42  4.46  2.65   55.0   59.7    2      2        4
12045    1.0   0.50  5.08  5.11  3.16   55.0   62.0    0      5        6

[5723 rows x 10 columns]


In [6]:
price_predect = []
X_test = X_test.reset_index()
for i in range(len(X_test)):
    price_predect.append(-2215.3961	
                    +3877.2509 *X_test["carat"][i] 
                    +1721.7128 *X_test["x"][i]
                    -1136.8569 *X_test["y"][i] 
                    -1190.1302 *X_test["z"][i]
                    -1.7343	 *X_test["table"][i] 
                    +47.4263 *X_test["depth"][i]
                    -16.5138 *X_test["cut"][i]
                    -51.5837 *X_test["color"][i]
                    -81.4655 *X_test["clarity"][i])

indexs = [i for i in range(len(X_test))]
d = dict(zip(indexs,price_predect) )

predict_data = pd.concat([y_test.reset_index() , pd.Series(d, name='predict')], axis = 'columns')
predict_data

Unnamed: 0,index,price,predict
0,13441,1050,1091.936728
1,5416,732,717.073696
2,7331,794,912.796608
3,9141,862,891.184598
4,8798,847,929.778020
...,...,...,...
5718,3894,684,701.445828
5719,14735,492,600.211732
5720,11421,952,987.045678
5721,3643,679,646.202061


In [7]:
predict_data.tail(20)

Unnamed: 0,index,price,predict
5703,14048,1080,1096.490071
5704,7324,794,867.694854
5705,9911,889,872.495783
5706,3439,672,695.966325
5707,4566,706,726.620299
5708,12063,983,859.920233
5709,13918,1075,1069.274248
5710,5323,730,718.634963
5711,18814,544,597.587785
5712,11920,977,845.062142


 # **Check Mean Absolute Error**

In [8]:
ans = []
for i in predict_data.index:
    d = predict_data["price"][i] - predict_data["predict"][i]
    ans.append(abs(d))
print(sum(ans) / len(ans))

69.57025680010494


 # **Check Mean Absolute Percen Error**

In [9]:
summm = 0
for i in predict_data.index:
    d = predict_data["price"][i] - predict_data["predict"][i]
    summm += abs(d) / predict_data["price"][i]

mape=(summm / len(predict_data))*100
print("mape =",mape)

mape = 8.65186738879971
