In [1]:
from google.colab import drive
# Mount Google Drive (Colab only) so the files under /content/drive can be read below.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
# Single root for the competition files on the mounted Drive; change the location here only.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/dacon/law contest/data'

train_r = f'{DATA_DIR}/train.csv'  # cases that include the first_party_winner label
test_r = f'{DATA_DIR}/test.csv'    # cases without the label

In [5]:
# Load the training and test CSV files into DataFrames.
train = pd.read_csv(train_r)
test = pd.read_csv(test_r)

In [6]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB


In [7]:
# Same overview for the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1240 entries, 0 to 1239
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1240 non-null   object
 1   first_party   1240 non-null   object
 2   second_party  1240 non-null   object
 3   facts         1240 non-null   object
dtypes: object(4)
memory usage: 38.9+ KB


In [None]:
# One shared TF-IDF vectorizer; get_vector fits it when train_mode is true and reuses it otherwise.
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    """Build one dense feature matrix from the three text columns of ``df``.

    The vectorizer's vocabulary comes from the ``facts`` column only: it is
    fitted there when ``train_mode`` is true and merely applied otherwise.
    ``first_party`` and ``second_party`` are then encoded with that same
    vocabulary.

    Args:
        vectorizer: Object exposing ``fit_transform`` / ``transform`` that
            return SciPy sparse matrices (a ``TfidfVectorizer`` in this
            notebook).
        df: DataFrame with ``first_party``, ``second_party`` and ``facts``
            columns.
        train_mode: If true, fit the vectorizer on ``df['facts']``; if false,
            reuse the vocabulary fitted earlier.

    Returns:
        ``numpy.ndarray`` with one row per row of ``df`` and the column blocks
        laid out as ``[first_party | second_party | facts]``.
    """
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    # .toarray() yields plain ndarrays; .todense() would yield the legacy
    # np.matrix type, which NumPy no longer recommends and recent
    # scikit-learn releases refuse as estimator input.
    return np.hstack([X_party1.toarray(), X_party2.toarray(), X_facts.toarray()])

In [None]:
# train_mode=True: fit the vectorizer on the training facts and build the training features.
X_train = get_vector(vectorizer, train, True)
# Target vector: the first_party_winner column of the training frame.
Y_train = train["first_party_winner"]
# train_mode=False: encode the test set with the vocabulary fitted above (must run after the training call).
X_test = get_vector(vectorizer, test, False)

In [None]:
# Make sure both feature matrices are plain ndarrays before they are handed to the estimators.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline multinomial naive Bayes classifier fitted on the full training set.
naive_bayes=MultinomialNB(alpha=1, fit_prior=True) # alpha: Laplace smoothing parameter; fit_prior: use prior class probabilities
naive_bayes.fit(X_train, Y_train)

In [None]:
y_pred=naive_bayes.predict(X_test) # predicted first_party_winner label for every test row
print(y_pred[:20])

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [None]:
# Display the predicted labels.
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Load the sample-submission file; its first_party_winner column is filled with predictions next.
submit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dacon/law contest/data/sample_submission.csv')

In [None]:
# Store the baseline naive Bayes predictions (assumes the submission rows are in test.csv order -- TODO confirm).
submit['first_party_winner'] = y_pred

In [None]:
# Write the baseline submission to the current working directory, without the index column.
submit.to_csv('./basenaive_submit.csv', index=False)
print('Done')

Done


In [None]:
print(naive_bayes.score(X_train, Y_train)) # accuracy of the naive Bayes model on the training data

0.6824051654560129


In [None]:
np.set_printoptions(precision=5, suppress=True)  # print 5 decimals and no scientific notation

x_pred_proba=naive_bayes.predict_proba(X_train) # per-class probabilities on the training data; columns follow naive_bayes.classes_
x_pred_proba[:5]

array([[0.00713, 0.99287],
       [0.06507, 0.93493],
       [0.00136, 0.99864],
       [0.3447 , 0.6553 ],
       [0.00267, 0.99733]])

In [None]:
np.set_printoptions(precision=5, suppress=True)  # print 5 decimals and no scientific notation

x_pred_proba=naive_bayes.predict_proba(X_test) # per-class probabilities on the test data; columns follow naive_bayes.classes_
x_pred_proba[:5]

array([[0.1028 , 0.8972 ],
       [0.01796, 0.98204],
       [0.01581, 0.98419],
       [0.01128, 0.98872],
       [0.02691, 0.97309]])

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the naive Bayes grid search.
parameters = dict(
    alpha=[0.5, 0.8, 1.0, 1.2, 1.5, 1.8, 2.0],  # smoothing parameter (the baseline used 1)
    fit_prior=[True, False],  # whether to apply prior probabilities from the training data
)

In [None]:
grid_search= GridSearchCV(naive_bayes, parameters, n_jobs=-1, cv=10, scoring="roc_auc")
# n_jobs=-1 uses every core; cv=10 means 10-fold cross-validation
%time grid_search.fit(X_train, Y_train)

CPU times: user 2.82 s, sys: 1.65 s, total: 4.47 s
Wall time: 2min 56s


In [None]:
# Parameter combination that scored best under the grid search's roc_auc scoring.
grid_search.best_params_

{'alpha': 0.5, 'fit_prior': True}

In [None]:
naive_bayes_best=grid_search.best_estimator_ # estimator carrying the best parameters found
try_pred=naive_bayes_best.predict(X_train)
try_pred

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Score of the tuned model on the training data it was fitted on.
naive_bayes_best.score(X_train, Y_train)

0.7719935431799838

In [None]:
# Per-class probabilities of the tuned model on the training data (first five rows shown).
y_pred_proba=naive_bayes_best.predict_proba(X_train)
y_pred_proba[:5]

array([[0.00237, 0.99763],
       [0.11498, 0.88502],
       [0.00054, 0.99946],
       [0.59396, 0.40604],
       [0.00149, 0.99851]])

In [None]:
naive_bayes_best=grid_search.best_estimator_ # estimator carrying the best parameters found
y_pred=naive_bayes_best.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Reload the sample-submission file for the tuned model's predictions.
submit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dacon/law contest/data/sample_submission.csv')

In [None]:
# Store the tuned naive Bayes predictions (assumes the submission rows are in test.csv order -- TODO confirm).
submit['first_party_winner'] = y_pred

In [None]:
# Write the grid-searched submission to the current working directory, without the index column.
submit.to_csv('./gridnive_submit.csv', index=False)
print('Done')

Done


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Use a random forest classifier
forest = RandomForestClassifier(
    n_estimators = 100, max_depth=20, n_jobs = -1,# -1 uses every core
    random_state=50 # fixed seed so every run gives the same result while tuning parameters
    )
forest

In [None]:
from sklearn.metrics import accuracy_score

%time forest = forest.fit(term_docs_train ,y_train) #행렬 데이터,벡터 데이터
y_pred = forest.predict(term_docs_train)

random_a_n = accuracy_score(y_train,y_pred)
print('RandomForest_train_accuracy score = ',random_a_n)

In [None]:
# Predict on the notebook's test feature matrix (X_test); term_docs_test is never defined here.
y_pred = forest.predict(X_test)

# Per-class probabilities for the test rows; columns follow forest.classes_.
y_pred_proba=forest.predict_proba(X_test)
y_pred_proba[:5]


In [None]:
# NOTE(review): y_test is not defined in any visible cell, and test.info() shows no
# first_party_winner column in the test set, so this cell cannot run as written.
# Confirm where held-out labels should come from (e.g. a split of the training data).
random_a_t = accuracy_score(y_test,y_pred)

print('RandomForest_test_accuracy score = ',random_a_t)

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

%time np.mean(cross_val_score(forest, term_docs_train, y_train, cv=10,scoring='roc_auc'))#cross validation #roc 커브 사용

In [None]:
# Search space for the random forest grid search: depth limits and ensemble sizes.
parameters = dict(
    max_depth=[20, 30, 40],
    n_estimators=[150, 200, 250, 300],
)

In [None]:
grid_search= GridSearchCV(forest, parameters, n_jobs=-1, cv=10, scoring="roc_auc")
%time grid_search.fit(term_docs_train, y_train)
grid_search.best_params_

In [None]:
randomforest_best=grid_search.best_estimator_ # estimator carrying the best parameters found
# Predict on the notebook's test features (X_test); term_docs_test is never defined here.
y_pred=randomforest_best.predict(X_test)
# Scoring the model against its own predictions always gives 1.0, so report the
# grid search's cross-validated score (roc_auc) for the best parameters instead.
grid_search.best_score_

In [None]:
# Per-class probabilities for the test rows; columns follow randomforest_best.classes_.
# Uses X_test; term_docs_test is never defined in this notebook.
y_pred_proba=randomforest_best.predict_proba(X_test)
y_pred_proba[:5]

In [None]:
from sklearn.metrics import confusion_matrix  # was used without ever being imported

# NOTE(review): y_test is not defined in any visible cell and the test set carries no
# labels, so this cell still needs held-out labels before it can run -- confirm the source.
# NOTE(review): the ham/spam row and column labels look carried over from another
# example; confirm the wording wanted for first_party_winner.
cm2=confusion_matrix(y_test, y_pred)
# Build the table from cm2 (cm1 does not exist) via pd.DataFrame (bare DataFrame was never imported).
cmdf2=pd.DataFrame(cm2, index=['실제 ham','실제 spam'], columns=['예측 ham','예측 spam'])
cmdf2

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:,1]) # spam일 확률 사용, _는 threshold(임계값)생략
auc=roc_auc_score(y_test, y_pred_proba[:,1])
plt.plot(fpr, tpr, "r-", label="RandomForest")
plt.plot([0,1], [0,1], "b--", label="random guess")
plt.xlabel=("false positive rate")
plt.ylabel=("true positive rate")
plt.title=("ROC Curve: AUC={:.5f}".format(auc))
plt.legend(loc="lower right");

%time for_auc=np.mean(cross_val_score(forest, term_docs_test, y_test, cv=10,scoring='roc_auc'))#cross validation #roc 커브 사용