## *Import Library*

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import shapiro

# Sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Statmodels
import statsmodels
import statsmodels.api as sm

In [30]:
df = pd.read_csv('Data_wine_aroma.csv')
df.head(10)

Unnamed: 0,Cd,Mo,Mn,Ni,Cu,Al,Ba,Cr,Sr,Pb,B,Mg,Si,Na,Ca,P,K,Aroma
0,0.005,0.044,1.51,0.122,0.83,0.982,0.387,0.029,1.23,0.561,2.63,128.0,17.3,66.8,80.5,150.0,1130,3.3
1,0.05,0.106,1.25,0.114,0.055,1.27,0.275,0.019,1.05,0.491,6.56,172.0,18.7,15.7,112.0,137.0,1290,4.6
2,0.056,0.146,1.1,0.088,0.643,1.29,0.308,0.035,1.14,0.73,3.05,127.0,15.8,35.4,91.0,161.0,1160,3.9
3,0.077,0.261,1.65,0.073,0.285,0.596,0.078,0.063,0.156,1.02,5.04,94.6,6.34,10.4,54.9,132.0,899,7.1
4,0.011,0.363,1.38,0.16,0.051,1.32,0.38,0.059,1.13,1.73,3.07,138.0,16.7,76.6,84.6,164.0,1090,5.6
5,0.034,0.05,1.15,0.058,0.058,1.35,0.294,0.006,1.12,0.206,2.71,120.0,14.7,68.1,64.8,133.0,1050,4.6
6,0.025,0.479,1.07,0.168,0.753,0.715,0.164,0.062,0.823,2.06,4.57,179.0,17.8,98.5,122.0,184.0,1170,4.8
7,0.067,0.166,1.53,0.041,0.043,0.512,0.132,0.026,0.229,0.699,7.27,107.0,6.0,55.2,44.9,148.0,854,7.7
8,0.061,0.245,1.61,0.07,0.172,2.07,0.071,0.053,0.186,1.19,4.42,87.6,7.62,11.6,70.6,156.0,820,5.5
9,0.033,0.074,1.28,0.098,0.053,1.35,0.329,0.03,1.07,0.552,3.3,140.0,16.3,70.5,74.7,159.0,1100,4.3


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Cd      41 non-null     float64
 1   Mo      41 non-null     float64
 2   Mn      41 non-null     float64
 3   Ni      41 non-null     float64
 4   Cu      41 non-null     float64
 5   Al      41 non-null     float64
 6   Ba      41 non-null     float64
 7   Cr      41 non-null     float64
 8   Sr      41 non-null     float64
 9   Pb      41 non-null     float64
 10  B       41 non-null     float64
 11  Mg      41 non-null     float64
 12  Si      41 non-null     float64
 13  Na      41 non-null     float64
 14  Ca      41 non-null     float64
 15  P       41 non-null     float64
 16  K       41 non-null     int64  
 17  Aroma   41 non-null     float64
dtypes: float64(17), int64(1)
memory usage: 5.9 KB


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cd,41.0,0.053805,0.023502,0.005,0.039,0.058,0.067,0.106
Mo,41.0,0.177366,0.095534,0.009,0.126,0.166,0.234,0.479
Mn,41.0,1.64622,0.601835,0.906,1.25,1.61,1.76,4.08
Ni,41.0,0.102024,0.082417,0.041,0.061,0.073,0.112,0.466
Cu,41.0,0.17061,0.188871,0.038,0.058,0.099,0.171,0.83
Al,41.0,0.925317,0.470496,0.308,0.596,0.811,1.21,2.07
Ba,41.0,0.216293,0.106508,0.07,0.105,0.225,0.299,0.408
Cr,41.0,0.04239,0.018888,0.006,0.028,0.038,0.053,0.1
Sr,41.0,0.800512,0.445945,0.156,0.36,0.898,1.12,2.14
Pb,41.0,0.864146,0.458837,0.025,0.561,0.777,1.19,2.06


In [None]:
## *Missing Values*

In [8]:
df.isna().sum()

Cd       0
Mo       0
Mn       0
Ni       0
Cu       0
Al       0
Ba       0
Cr       0
Sr       0
Pb       0
B        0
Mg       0
Si       0
Na       0
Ca       0
P        0
K        0
Aroma    0
dtype: int64

In [None]:
## *Cek Duplikat Data*

In [9]:
duplicate = df[df.duplicated()]
duplicate

Unnamed: 0,Cd,Mo,Mn,Ni,Cu,Al,Ba,Cr,Sr,Pb,B,Mg,Si,Na,Ca,P,K,Aroma
20,0.061,0.245,1.61,0.07,0.172,2.07,0.071,0.053,0.186,1.19,4.42,87.6,7.62,11.6,70.6,156.0,820,5.5
33,0.065,0.211,1.65,0.102,0.055,0.308,0.206,0.028,0.72,1.02,6.12,99.3,27.1,20.5,95.2,194.0,1260,5.1
37,0.039,0.071,1.19,0.043,0.163,0.971,0.105,0.028,0.491,0.31,6.56,103.0,9.47,45.3,67.9,133.0,1090,5.1


In [15]:
# Cek ukuran data apakah duplicate sudah terhapus
df.shape

(41, 18)

In [22]:
df.columns

Index(['Cd', 'Mo', 'Mn', 'Ni', 'Cu', 'Al', 'Ba', 'Cr', 'Sr', 'Pb', 'B', 'Mg',
       'Si', 'Na', 'Ca', 'P', 'K', 'Aroma'],
      dtype='object')

In [23]:
x = df.drop(columns='Aroma')
y = df['Aroma']

In [13]:
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
lr = LinearRegression()
lr.fit(x,y)
y_hat = lr.predict(x)
print('MSE :', mean_squared_error(y, y_hat))
print('R-square :', r2_score(y, y_hat))

MSE : 0.16607533836114344
R-square : 0.8373972603034061


In [24]:
df.shape

(41, 18)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [17]:
x_train.shape, x_test.shape

((32, 17), (9, 17))

In [25]:
lr.fit(x_train, y_train)
y_hat = lr.predict(x_test)
print('MSE :', mean_squared_error(y_test, y_hat))
print('R-square :', r2_score(y_test, y_hat))

MSE : 0.7278396191563388
R-square : 0.3805945665931557


In [26]:
df.nunique()

Cd       33
Mo       35
Mn       30
Ni       33
Cu       33
Al       37
Ba       36
Cr       31
Sr       36
Pb       34
B        35
Mg       35
Si       33
Na       36
Ca       37
P        34
K        33
Aroma    25
dtype: int64

# DATA HOUSE PRICE

In [31]:
data = pd.read_csv('house_prices.csv')
data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [35]:
data.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [38]:
kategorikal = data.select_dtypes(include='object').columns
kategorikal

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [40]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in kategorikal:
  data[i] = le.fit_transform(data[i])

In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   int32  
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   int32  
 6   Alley          1460 non-null   int32  
 7   LotShape       1460 non-null   int32  
 8   LandContour    1460 non-null   int32  
 9   Utilities      1460 non-null   int32  
 10  LotConfig      1460 non-null   int32  
 11  LandSlope      1460 non-null   int32  
 12  Neighborhood   1460 non-null   int32  
 13  Condition1     1460 non-null   int32  
 14  Condition2     1460 non-null   int32  
 15  BldgType       1460 non-null   int32  
 16  HouseStyle     1460 non-null   int32  
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [42]:
data.nunique()

Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
                 ... 
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
SalePrice         663
Length: 81, dtype: int64