#### sklearn.svm.SVC

* class sklearn.svm.SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)[source]

* parameters:
    - C : float, default=1.0 ,c값이 클수록 복잡한 모형 대신 오버피팅이 되면서 margine이 작아짐
    - Gamma : {‘scale’, ‘auto’} or float, default=’scale’ , Gamma값이 클수록 복잡한 모형 클수록 서포트벡터로 잡는 벡터의 수가 작아짐
    - kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’

In [22]:
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
x = iris.data
y = iris.target

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.5)

clf = SVC(gamma=0.001,C=100)
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
acc = accuracy_score(y_test,pred)

print(acc)

pram = {'C':[1,10,100],
       'gamma':[0.1,0.01,0.001]}
grid = GridSearchCV(clf,param_grid=pram,cv=3,scoring="accuracy")
grid.fit(x_train,y_train)
best_clf = grid.best_estimator_
best_pred = best_clf.predict(x_test)
best_acc = accuracy_score(y_test,best_pred)
print(best_acc)

0.9466666666666667
0.96


### human

In [2]:
import pandas as pd

feature_name_df = pd.read_csv('./datasets/human_activity/features.txt',sep='\s+',
           header=None,names=['column_index','column_name'])
feature_name = feature_name_df.iloc[:,1].values.tolist()

def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(),columns=['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(),feature_dup_df,how='outer')
    new_feature_name_df['column_name'] =new_feature_name_df[['column_name','dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) if x[1] >0 else x[0],axis=1)
    
    new_feature_name_df = new_feature_name_df.drop(['index'],axis=1)
    return new_feature_name_df

def get_human_dataset():
    # 각 데이터 파일들은 공백으로 분리되어 있으므로 read_csv에서 공백 문자를 sep으로 할당
    feature_name_df = pd.read_csv('./datasets/human_activity/features.txt',sep='\s+',
           header=None,names=['column_index','column_name'])
    # 중복된 피처명을 수정하는 get_new_feature_name_df를 이용 신규 피쳐명 df생성
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    
    # DF에 피쳐명을 컬럼으로 부여하기 위해 리스트 객체로 다시 변환
    feature_name = new_feature_name_df.iloc[:,1].values.tolist()
    
    # 학습 피처 데이터 셋과 테스트 피처 데이터 셋을 DF로 로딩 컬러명은 feature_name적용
    x_train = pd.read_csv('./datasets/human_activity/train/X_train.txt',sep="\s+",names=feature_name)
    x_test = pd.read_csv('./datasets/human_activity/test/X_test.txt',sep="\s+",names=feature_name)
    
    # 학습 레이블과 테스트 레이블 데이터를 DF로 할당 컬럼명은 action으로 부여
    y_train = pd.read_csv('./datasets/human_activity/train/y_train.txt',sep="\s+",header=None,names=['action'])
    y_test = pd.read_csv('./datasets/human_activity/test/y_test.txt',sep="\s+",header=None,names=['action'])
    
    # 로드된 학습/테스트용 DataFrame을 모두 변환
    return x_train,x_test,y_train,y_test

x_train,x_test,y_train,y_test=get_human_dataset()

In [34]:
y_test

Unnamed: 0,action
0,5
1,5
2,5
3,5
4,5
...,...
2942,2
2943,2
2944,2
2945,2


In [24]:
### 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier(random_state=156)
rf_clf.fit(x_train,y_train)
pred = rf_clf.predict(x_test)
accuracy = accuracy_score(y_test,pred)
print(f'랜덤포레스트 예측 정확도: {accuracy}')
print(f'기본 하이퍼파라미터 {rf_clf.get_params()}')


### gridsearchcv
from sklearn.model_selection import GridSearchCV

## 하이퍼파라미터 gridsearchcv
param = {'max_depth':[6,8,10,12],
        'n_estimators':[100],
        'min_samples_leaf':[8,12,18],
        'min_samples_split':[8,16,20]}

rf_clf = RandomForestClassifier(random_state=156,n_jobs=-1)
grid_cv = GridSearchCV(rf_clf,param_grid=param,cv=2,scoring='accuracy',n_jobs=-1)
grid_cv.fit(x_train,y_train)
best_dt = grid_cv.best_estimator_

### 변수선택
ftf_importances_values = best_dt.feature_importances_
importances = pd.Series(ftf_importances_values,index=x_train.columns)
top10 = importances.sort_values(ascending=False)[:10]

  rf_clf.fit(x_train,y_train)


랜덤포레스트 예측 정확도: 0.9236511706820495
기본 하이퍼파라미터 {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 156, 'verbose': 0, 'warm_start': False}


  self.best_estimator_.fit(X, y, **fit_params)


In [3]:
### 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier(random_state=156)
rf_clf.fit(x_train,y_train)

  rf_clf.fit(x_train,y_train)


In [38]:
feature_importaces = rf_clf.feature_importances_

df = pd.DataFrame(feature_importaces,index=x_train.columns,  columns=['column_name'] )
df_sort = df.sort_values('column_name',ascending=False )
feature_selection_name = df_sort.index[:20]

In [39]:

x_train_10 = x_train[feature_selection_name]
x_train_10


Unnamed: 0,tGravityAcc-min()-X,tGravityAcc-energy()-X,"angle(Y,gravityMean)",tGravityAcc-mean()-Y,tGravityAcc-max()-Y,tGravityAcc-min()-Y,tGravityAcc-energy()-Y,"angle(X,gravityMean)",tGravityAcc-mean()-X,tGravityAcc-max()-X,"fBodyAccJerk-bandsEnergy()-1,24","fBodyAccJerk-bandsEnergy()-1,8",tGravityAccMag-std(),"angle(Z,gravityMean)",tGravityAcc-max()-Z,"tGravityAcc-arCoeff()-Y,2",tBodyAcc-max()-X,tBodyAccJerk-std()-X,tBodyAccJerkMag-mean(),tGravityAcc-mean()-Z
0,0.977436,0.899469,0.179941,-0.140840,-0.161265,-0.123213,-0.970905,-0.841247,0.963396,0.892055,-0.999981,-0.999986,-0.950551,-0.058627,0.124660,0.720862,-0.934724,-0.993519,-0.993306,0.115375
1,0.984520,0.907829,0.180289,-0.141551,-0.161343,-0.114893,-0.970583,-0.844788,0.966561,0.892060,-0.999974,-0.999996,-0.976057,-0.054317,0.122586,0.125345,-0.943068,-0.995548,-0.991253,0.109379
2,0.986770,0.908668,0.180637,-0.142010,-0.163711,-0.114893,-0.970368,-0.848933,0.966878,0.892401,-0.999909,-0.999994,-0.988020,-0.049118,0.094566,0.270500,-0.938692,-0.990743,-0.988531,0.101884
3,0.986821,0.910621,0.181935,-0.143976,-0.163711,-0.121336,-0.969400,-0.848649,0.967615,0.893817,-0.999927,-0.999998,-0.986421,-0.047663,0.093425,0.228310,-0.938692,-0.992697,-0.993078,0.099850
4,0.987434,0.912235,0.185151,-0.148750,-0.166786,-0.121834,-0.967051,-0.847865,0.968224,0.893817,-0.999975,-0.999995,-0.991275,-0.043892,0.091682,0.089943,-0.942469,-0.996420,-0.993480,0.094486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,0.920554,0.794766,0.238604,-0.222004,-0.214233,-0.234998,-0.918375,-0.791883,0.923148,0.867718,-0.724214,-0.839256,-0.093688,0.049819,-0.016391,0.684225,0.210795,-0.299527,-0.413920,-0.039492
7348,0.920554,0.782407,0.252676,-0.242054,-0.231477,-0.234998,-0.902880,-0.771840,0.918343,0.866116,-0.767363,-0.854278,-0.148539,0.050053,-0.016391,0.654116,0.117440,-0.350932,-0.434071,-0.039863
7349,0.933008,0.786085,0.249145,-0.236950,-0.249134,-0.216004,-0.907561,-0.779133,0.919810,0.854641,-0.757269,-0.815380,-0.158701,0.040811,0.024684,0.448116,0.043999,-0.345455,-0.411072,-0.026805
7350,0.933008,0.792538,0.246432,-0.233230,-0.244267,-0.210542,-0.910648,-0.785181,0.922323,0.855988,-0.780800,-0.822905,-0.185720,0.025339,0.024684,0.404027,0.101702,-0.387107,-0.444878,-0.004984


In [40]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


clf = SVC(gamma=0.001,C=100)
clf.fit(x_train_10,y_train)
pred = clf.predict(x_test[feature_selection_name])
acc = accuracy_score(y_test,pred)

print(acc)

pram = {'C':[1,10,100],
       'gamma':[0.1,0.01,0.001]}
grid = GridSearchCV(clf,param_grid=pram,cv=2,scoring="accuracy")
grid.fit(x_train_10,y_train)
best_clf = grid.best_estimator_
best_pred = best_clf.predict(x_test[feature_selection_name])
best_acc = accuracy_score(y_test,best_pred)

print(best_acc)

0.8686800135731252
0.8686800135731252
