In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('./데이터/data.csv')

# 채점 모델 만들기

## 데이터셋 분리

In [3]:
from sklearn.model_selection import train_test_split

# train, test 데이터셋 분리
X = data[data.columns[:-1]]
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)

## 모델 생성

In [4]:
from sklearn.metrics import accuracy_score

### 1. SVC

In [5]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# # 정규화 작업
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)

# SVM 모델 생성
sv_clf = SVC(kernel='poly', C = 3, degree = 3, probability=True)
sv_clf.fit(X_train, y_train)

# # test 데이터셋도 정규화(train 데이터셋 기준으로 학습시킨 정규화 모듈 사용)
# X_test = scaler.transform(X_test)

svm_pred = sv_clf.predict(X_test) # 예측 라벨
accuracy_score(y_test, svm_pred)

0.7109144542772862

### 2. RandomForest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics    
 
# 학습 진행
forest = RandomForestClassifier(n_estimators=100)
forest.fit(X_train, y_train)

# 예측
rf_pred = forest.predict(X_test)

# 정확도 확인
print('정확도 :', metrics.accuracy_score(y_test, rf_pred))

정확도 : 0.7846607669616519


### 3. 로지스틱 회귀

In [7]:
from sklearn.linear_model import LogisticRegression

logit_clf = LogisticRegression()
logit_clf.fit(X_train, y_train)

lr_pred = logit_clf.predict(X_test)
accuracy_score(y_test, lr_pred)

0.7669616519174042

### 4. XGBoost

In [8]:
import xgboost as xgb

# XGBoost 모델 초기화 및 학습
model = xgb.XGBClassifier(objective="multi:softmax", num_class=3)  # 다중 클래스 분류를 위한 설정
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7728613569321534


In [13]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, svm_pred, labels=[0, 1]))
print(confusion_matrix(y_test, rf_pred, labels=[0, 1]))
print(confusion_matrix(y_test, lr_pred, labels=[0, 1]))
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

[[108  77]
 [ 21 133]]
[[158  27]
 [ 46 108]]
[[148  37]
 [ 42 112]]
[[146  39]
 [ 38 116]]


### 5. 딥러닝

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import numpy as np

# 신경망 모델 구축
model = Sequential()
model.add(Dense(32, input_dim=12, activation='relu'))
model.add(Dense(1, activation='softmax'))

# 모델 컴파일
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 모델 학습
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# 모델 평가
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# 벡터의 유사도 계산

In [4]:
import pandas as pd

In [5]:
bts_dy = pd.read_csv('./데이터/label_bts_dynamite.csv')

In [17]:
a = bts_dy[bts_dy['label'] == 1].iloc[0, :-1]
b = bts_dy[bts_dy['label'] == 0].iloc[0, :]

In [21]:
target_col = [i for i in bts_dy.columns if 'target' in i]
user_col = [i for i in bts_dy.columns if 'user' in i]

In [29]:
target_x_col = [i for i in target_col if 'x' in i]
target_y_col = [i for i in target_col if 'y' in i]
target_z_col = [i for i in target_col if 'z' in i]

user_x_col = [i for i in user_col if 'x' in i]
user_y_col = [i for i in user_col if 'y' in i]
user_z_col = [i for i in user_col if 'z' in i]

In [32]:
target_x_t = a[target_x_col]
target_y_t = a[target_y_col]
target_z_t = a[target_z_col]
user_x_t = a[user_x_col]
user_y_t = a[user_y_col]
user_z_t = a[user_z_col]

target_x_f = b[target_x_col]
target_y_f = b[target_y_col]
target_z_f = b[target_z_col]
user_x_f = b[user_x_col]
user_y_f = b[user_y_col]
user_z_f = b[user_z_col]

## 유클리드 거리

In [42]:
from scipy.spatial import distance
import numpy as np

In [71]:
# 각각 구해서 평균
ud_x_t = distance.euclidean(target_x_t, user_x_t)
ud_y_t = distance.euclidean(target_y_t, user_y_t)
ud_z_t = distance.euclidean(target_z_t, user_z_t)

round(1/(1+(ud_x_t + ud_y_t + ud_z_t)/3), 4) * 100

74.78

In [69]:
ud_x_f = distance.euclidean(target_x_f, user_x_f)
ud_y_f = distance.euclidean(target_y_f, user_y_f)
ud_z_f = distance.euclidean(target_z_f, user_z_f)

round(1/(1+(ud_x_f + ud_y_f + ud_z_f)/3), 4) * 100

37.89

In [68]:
# 한꺼번에
test = distance.euclidean(a[target_col], a[user_col])
round(1/(1+test) * 100, 2)

62.96

In [70]:
test2 = distance.euclidean(b[target_col], b[user_col])
round(1/(1+test2) * 100, 2)

24.15