In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import SVR

#### 1.캘리포니아 주택 가격 데이터셋을 training/test로 나누고 training set을 이용하여 SVM 모형을 훈련시키고 training과 test set에서의 정확도를 계산하시오.


In [2]:
# Read 'housing.csv' into a DataFrame and preview the first rows.
house = pd.read_csv('housing.csv')
house.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Show column dtypes and non-null counts.
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
# Summary statistics, transposed so that each column becomes a row.
house.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,20640.0,-119.569704,2.003532,-124.35,-121.8,-118.49,-118.01,-114.31
latitude,20640.0,35.631861,2.135952,32.54,33.93,34.26,37.71,41.95
housing_median_age,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
total_rooms,20640.0,2635.763081,2181.615252,2.0,1447.75,2127.0,3148.0,39320.0
total_bedrooms,20433.0,537.870553,421.38507,1.0,296.0,435.0,647.0,6445.0
population,20640.0,1425.476744,1132.462122,3.0,787.0,1166.0,1725.0,35682.0
households,20640.0,499.53968,382.329753,1.0,280.0,409.0,605.0,6082.0
median_income,20640.0,3.870671,1.899822,0.4999,2.5634,3.5348,4.74325,15.0001
median_house_value,20640.0,206855.816909,115395.615874,14999.0,119600.0,179700.0,264725.0,500001.0


In [5]:
# Convert the categorical variable(s) into dummy (indicator) columns.
house = pd.get_dummies(house)
house.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [6]:
# Count missing values per column.
house.isnull().sum()

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                207
population                      0
households                      0
median_income                   0
median_house_value              0
ocean_proximity_<1H OCEAN       0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64

In [7]:
# Replace the missing total_bedrooms values with the column *median*.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: the in-place form operates
# on an intermediate object and does not reliably update `house` under
# pandas Copy-on-Write.
# Note: the median is computed over the whole data set, i.e. before the
# train/test split.
house['total_bedrooms'] = house['total_bedrooms'].fillna(house['total_bedrooms'].median())
house.isnull().sum()

longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
median_house_value            0
ocean_proximity_<1H OCEAN     0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
dtype: int64

In [8]:
# Separate the target (median_house_value) from the features, then hold out
# 20% of the rows as a test set using a fixed random seed.
y = house["median_house_value"]
x = house.drop(columns="median_house_value")

train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=777,
)

In [9]:
# Preview the training features.
train_x.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
11556,-117.99,33.73,17.0,5239.0,1045.0,2440.0,985.0,4.375,1,0,0,0,0
11862,-121.25,40.27,25.0,958.0,245.0,28.0,16.0,2.625,0,1,0,0,0
3236,-119.58,36.1,21.0,1382.0,327.0,1469.0,355.0,1.3967,0,1,0,0,0
19330,-122.85,38.46,22.0,3328.0,550.0,1309.0,512.0,4.7105,1,0,0,0,0
11711,-120.18,39.17,18.0,1703.0,360.0,354.0,163.0,3.6563,0,1,0,0,0


In [10]:
# Preview the training target.
train_y.head()

11556    248100.0
11862     67500.0
3236      46500.0
19330    266200.0
11711    146900.0
Name: median_house_value, dtype: float64

In [11]:
# model fitting
# Standardize the features: the scaler is fitted on the training set only and
# then applied unchanged to the test set. Note that train_x / test_x are
# rebound to the scaled arrays returned by the scaler.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# Support vector regressor with C=100 and epsilon=0.1 (kernel left at its default).
svr = SVR(C=100, epsilon = 0.1)

svr.fit(train_x, train_y)

SVR(C=100)

In [13]:
# Mean squared error of the fitted SVR on the training set.
pred_y_train = svr.predict(train_x)
mse_train = mean_squared_error(train_y, pred_y_train)
mse_train

9056963154.425293

In [14]:
# Mean squared error of the fitted SVR on the held-out test set.
pred_y_test = svr.predict(test_x)
mse_test = mean_squared_error(test_y, pred_y_test)
mse_test

8672017935.573608

In [79]:
# Test-set MSE. The bare name `mse` is never defined in this notebook, so the
# cell only worked through leftover kernel state; use `mse_test` instead.
mse_test

8672017935.573608

In [81]:
# Coefficient of determination (R^2) of the SVR on the test set.
# Uses `pred_y_test`, the test-set predictions computed for the test MSE;
# the name `pred_y` is not defined anywhere in this notebook.
# `r2_score` is imported in the import cell at the top of the notebook.
r2_score(test_y, pred_y_test)

0.3272547009604361

#### 2.spam data: Training/test로 나누고 training set을 이용하여 SVM 모형을 훈련시키고 training과 test set에서의 정확도를 계산하시오.

In [2]:
# Read 'SPAM.csv', discard the 'testid' column, and preview the first rows.
spam = pd.read_csv('SPAM.csv').drop(columns='testid')
spam.head()

Unnamed: 0,spam,make,address,all,3d,our,over,remove,internet,order,...,conference,ch;,ch(,ch[,ch!,ch$,ch#,crl.ave,crl.long,crl.tot
0,True,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278
1,True,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,...,0.0,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028
2,True,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,...,0.0,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259
3,True,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,...,0.0,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191
4,True,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,...,0.0,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191


In [3]:
# Show column dtypes and non-null counts.
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   spam        4601 non-null   bool   
 1   make        4601 non-null   float64
 2   address     4601 non-null   float64
 3   all         4601 non-null   float64
 4   3d          4601 non-null   float64
 5   our         4601 non-null   float64
 6   over        4601 non-null   float64
 7   remove      4601 non-null   float64
 8   internet    4601 non-null   float64
 9   order       4601 non-null   float64
 10  mail        4601 non-null   float64
 11  receive     4601 non-null   float64
 12  will        4601 non-null   float64
 13  people      4601 non-null   float64
 14  report      4601 non-null   float64
 15  addresses   4601 non-null   float64
 16  free        4601 non-null   float64
 17  business    4601 non-null   float64
 18  email       4601 non-null   float64
 19  you         4601 non-null  

In [4]:
# Count missing values per column.
spam.isnull().sum()

spam          0
make          0
address       0
all           0
3d            0
our           0
over          0
remove        0
internet      0
order         0
mail          0
receive       0
will          0
people        0
report        0
addresses     0
free          0
business      0
email         0
you           0
credit        0
your          0
font          0
000           0
money         0
hp            0
hpl           0
george        0
650           0
lab           0
labs          0
telnet        0
857           0
data          0
415           0
85            0
technology    0
1999          0
parts         0
pm            0
direct        0
cs            0
meeting       0
original      0
project       0
re            0
edu           0
table         0
conference    0
ch;           0
ch(           0
ch[           0
ch!           0
ch$           0
ch#           0
crl.ave       0
crl.long      0
crl.tot       0
dtype: int64

In [5]:
# Separate the 'spam' label from the features, then hold out 20% of the rows
# as a test set using a fixed random seed.
y = spam["spam"]
x = spam.drop(columns="spam")

train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=777,
)

In [9]:
# model fitting
# Standardize the features: the scaler is fitted on the training set only and
# then applied unchanged to the test set. Note that train_x / test_x are
# rebound to the scaled arrays returned by the scaler.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# Support vector classifier with C=5; the kernel is left at its default.
# NOTE(review): per the scikit-learn docs, coef0 is only significant for the
# 'poly' and 'sigmoid' kernels, so coef0=1 has no effect with the default
# kernel -- confirm whether a different kernel was intended.
svm = SVC(coef0=1, C=5)

svm.fit(train_x, train_y)

SVC(C=5, coef0=1)

In [12]:
# Classification report for the training set
# (the printed report includes an accuracy row).
pred_y_train = svm.predict(train_x)
print(classification_report(train_y, pred_y_train))

              precision    recall  f1-score   support

       False       0.96      0.98      0.97      2253
        True       0.97      0.94      0.95      1427

    accuracy                           0.96      3680
   macro avg       0.96      0.96      0.96      3680
weighted avg       0.96      0.96      0.96      3680



In [13]:
# Classification report for the held-out test set
# (the printed report includes an accuracy row).
pred_y_test = svm.predict(test_x)
print(classification_report(test_y, pred_y_test))

              precision    recall  f1-score   support

       False       0.92      0.96      0.94       535
        True       0.94      0.88      0.91       386

    accuracy                           0.93       921
   macro avg       0.93      0.92      0.92       921
weighted avg       0.93      0.93      0.93       921

