<a href="https://colab.research.google.com/github/yk-Jeong/TIL/blob/main/TfidfVectorizer_%2B_LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data load 

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!unzip /content/drive/MyDrive/daicon/predict/open.zip

Archive:  /content/drive/MyDrive/daicon/predict/open.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [4]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## Data Preprocessing

In [5]:
train.head()

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [6]:
test.head()

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])
    
    X = np.concatenate([X_party1.todense(), X_party2.todense(), X_facts.todense()], axis=1)
    return X

In [8]:
X = get_vector(vectorizer, train, True)
y = train["first_party_winner"]
X_test = get_vector(vectorizer, test, False)

## Valid set split 

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [10]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((1982, 52377), (496, 52377), (1982,), (496,))

## Define Model & Train

### Random Forest 

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

TypeError: ignored

### Logistic Regression 

In [12]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
X_valid = np.asarray(X_valid)

In [25]:
from sklearn.model_selection import GridSearchCV

params = {
    'fit_intercept': [True], 
    'intercept_scaling': [1], 
    'class_weight': [None],
    'random_state': [None],   
    'multi_class': ['auto'], 
    'verbose': [1], 
    'warm_start': [False],
    'n_jobs': [None], 
    'l1_ratio': [None],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    'penalty': ['l1', 'l2', 'elasticnet'], 
    'C': [0.01, 0.1, 1, 5, 10], 
    'max_iter': [100, 200, 500, 1000],
    'tol':[1e-4],
    'warm_start':[True]
    }

In [26]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=10, max_iter=10000)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

#lrc_grid = GridSearchCV(lr, param_grid = params, scoring='roc_auc', cv = 5)
#lrc_grid.fit(X_train, y_train)

#print('최적 하이퍼 파라미터:', lrc_grid.best_params_, '\nROC AUC Score:', round(lrc_grid.best_score_, 4))

### Hyperparameters search: GridSearch

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

xgb_clf = XGBClassifier(n_estimators=100, random_state=0)
xgb_clf.fit(X_train, y_train, early_stopping_rounds=30,
            eval_metric="auc", eval_set=[(X_valid, y_valid)])

### LightGBM

In [42]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
X_valid = np.asarray(X_valid)

y_train = np.asarray(y_train)
y_valid = np.asarray(y_valid)

In [None]:
from lightgbm import LGBMClassifier
evals = [(X_valid, y_valid)]
lgbm_clf = LGBMClassifier(n_estimators=500) 
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=evals, verbose=True)

## Hyperparameter tunning: Optuna 

In [None]:
import optuna 

## Evaluation and Validation 

In [27]:
lr.score(X_train, y_train), lr.score(X_valid, y_valid)

(1.0, 0.6370967741935484)

In [None]:
from sklearn.metrics import roc 

## Inference & Submission

In [22]:
submit = pd.read_csv('./sample_submission.csv')

In [28]:
pred = lr.predict(X_test)

In [29]:
submit['first_party_winner'] = pred
submit.to_csv('./submit_lr_0607.csv', index=False)
print('Done')

Done
