In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


In [3]:
# Read the training table; every column except the last is a feature,
# and the last column is the target label.
data = pd.read_csv('train.csv')
X_data, label = data.iloc[:, :-1], data.iloc[:, -1]

In [12]:
# Display (rows, columns) of the raw training frame; the column count includes the label column.
data.shape

(6940, 94)

In [4]:
# Display how many training rows belong to each class.
label.value_counts()

1    3756
0    3184
Name: label, dtype: int64

In [5]:
# Read the test table and drop its first column, keeping every other column.
# NOTE(review): the first column is presumably an id/index column — confirm against test.csv.
test_data = pd.read_csv('test.csv').iloc[:,1:]
# Display (rows, columns) of the test features.
test_data.shape

(1735, 93)

### 전처리

In [6]:
from imblearn.over_sampling import SMOTE

# Oversample so both classes end up with the same number of rows.
# A fixed random_state makes the synthetic samples, and therefore every
# downstream metric, reproducible across kernel restarts.
# NOTE(review): resampling is done on the full training table, before the
# train/validation split performed later in this notebook, so synthetic rows
# may be interpolated from rows that land in the validation split and the
# validation scores may be optimistic.
smote = SMOTE(random_state=42)
X_data_smote, label_smote = smote.fit_resample(X_data, label)

# Class distribution before and after applying SMOTE
print("원본 데이터셋 클래스 분포:")
print(label.value_counts())

print("\nSMOTE 적용 후 데이터셋 클래스 분포:")
print(label_smote.value_counts())

원본 데이터셋 클래스 분포:
1    3756
0    3184
Name: label, dtype: int64

SMOTE 적용 후 데이터셋 클래스 분포:
0    3756
1    3756
Name: label, dtype: int64


In [43]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling parameters from the resampled training features,
# then apply that same fitted scaler to both the training and test features.
SC = RobustScaler().fit(X_data_smote)
train_x = SC.transform(X_data_smote)
test_x = SC.transform(test_data)

In [44]:
# The scaler returns plain arrays; wrap them back into DataFrames so the
# original column names are available again.
train_x_df = pd.DataFrame(data=train_x, columns=X_data_smote.columns)
test_x_df = pd.DataFrame(data=test_x, columns=test_data.columns)

In [45]:
# Hold out 20% of the resampled, scaled rows for validation.
# random_state pins the shuffle so scores are comparable between runs;
# stratify keeps the class ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    train_x_df,
    label_smote,
    test_size=0.2,
    random_state=42,
    stratify=label_smote,
)

### 모델 학습

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score

# Estimator used only inside RFECV to rank features.
f_model = xgb.XGBClassifier()

# Candidate classifiers to compare on the selected features.
svm = SVC()
LR = LogisticRegression(solver='liblinear')
# Seeded: the forest's bootstrap and feature sampling are random, and with
# only 7 trees an unseeded score can differ from one run to the next.
rf = RandomForestClassifier(n_estimators=7, random_state=42)
xg = xgb.XGBClassifier()

# Use RFECV to perform feature selection together with cross-validation
rfecv = RFECV(estimator=f_model)

In [63]:
# Fit the feature selector on the training split only, then reduce both
# the training and validation splits to the selected columns.
X_selected = rfecv.fit_transform(X_train, y_train)
test_selected = rfecv.transform(X_test)

# Train every candidate on the selected features (same order as listed).
for clf in (svm, LR, rf, xg):
    clf.fit(X_selected, y_train)

# Keep the XGBoost validation predictions; later cells reuse them.
y_pred_xg = xg.predict(test_selected)

# Validation accuracy for each model.
for tag, clf in (('svm', svm), ('LR', LR), ('rf', rf)):
    print(f'{tag} acc:{clf.score(test_selected,y_test)}')
print(f'xg acc:{accuracy_score(y_test,y_pred_xg)}')

svm acc:0.7236024844720497
LR acc:0.717391304347826
rf acc:0.6708074534161491
xg acc:0.7284826974267968


In [72]:
# Fit the SVM on the selected training features and predict the validation split.
svm.fit(X_selected, y_train)
# Predict with the SVM itself; using the XGBoost model here would make the
# "svm" recall reported later just a copy of the XGBoost recall.
y_pred_svm = svm.predict(test_selected)

In [67]:
from sklearn.metrics import recall_score

# Validation recall for the SVM and XGBoost predictions.
for tag, preds in (('svm', y_pred_svm), ('xg', y_pred_xg)):
    print(f'{tag} recall:{recall_score(y_test, preds)}')

svm recall:0.7031802120141343
xg recall:0.7093639575971732


In [74]:
# Reduce the scaled test features to the RFECV-selected columns and predict
# with the XGBoost model. A separate name is used so that `test_selected`
# (the validation features scored against y_test in earlier cells) is not
# overwritten, which would break those cells if they were re-run.
submission_selected = rfecv.transform(test_x_df)
y_pred = xg.predict(submission_selected)

In [68]:
# Display how many features the fitted RFECV selector kept.
rfecv.n_features_

54

In [25]:
# Names of the columns kept by RFECV: index the column labels directly
# with the selector's boolean support mask.
X_data.columns[rfecv.support_]

Index(['WC', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic', 'function', 'ppron',
       'i', 'we', 'they', 'article', 'auxverb', 'adverb', 'conj', 'negate',
       'number', 'affect', 'posemo', 'anx', 'anger', 'sad', 'family',
       'insight', 'cause', 'discrep', 'tentat', 'percept', 'feel', 'bio',
       'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve',
       'power', 'focuspast', 'space', 'time', 'work', 'leisure', 'home',
       'money', 'relig', 'swear', 'netspeak', 'nonflu', 'AllPunc', 'QMark',
       'Exclam', 'Dash', 'Apostro', 'OtherP'],
      dtype='object')

In [65]:
# Display the labels predicted for the test set.
y_pred

array([0, 1, 1, ..., 0, 1, 0])


- sub 31 -> 879, 856 -> SMOTE -> SVC => robust scaler, gamma = 0.01
- sub 32 -> 872, 863 -> SMOTE -> SVC => robust scaler, gamma = 0.01 / RFECV 사용
- sub 34 -> 876, 869 -> SMOTE -> LR => robust scaler / RFECV 사용
- sub 35 -> 871, 864 -> SMOTE -> SVC => minmax scaler / RFECV 사용
- sub 37 -> 875, 860 -> SMOTE -> SVC / RFECV 사용

In [75]:
# Build the submission table: a sequential 0-based id per test row plus
# the predicted label, then show the predicted class counts.
row_ids = list(range(len(test_data)))
sub = pd.DataFrame({'id': row_ids, 'label': y_pred})
sub['label'].value_counts()

1    878
0    857
Name: label, dtype: int64

In [76]:
# Write the submission table to disk, omitting the DataFrame index.
sub.to_csv('sub39.csv',index=False)