<a href="https://colab.research.google.com/github/jg116907/NLPstudy/blob/master/Text_data_classification_eng_modeling_lgs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 모델링 소개
  1. 로지스틱 회귀 모델
    - 주로 이항 분류를 위해 사용됨
    - TF-IDF와 word2vec을 이용한 벡터화를 활용

In [0]:
import pandas as pd

## TF-IDF를 활용한 로지스틱 회귀 분석

In [0]:
# Model built on TF-IDF features: read train_clean.csv (first row is the header)
train_data = pd.read_csv("train_clean.csv", header=0)

# Pull the review texts and their sentiment labels out as plain Python lists
reviews = train_data["review"].tolist()
sentiments = train_data["sentiment"].tolist()

In [0]:
# TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# analyzer="word" builds features per word, analyzer="char" per character;
# here character n-grams of length 1 to 3 are used, capped at 5000 features.
vectorizer = TfidfVectorizer(
    min_df=0.0,
    analyzer="char",
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
)
# NOTE(review): the vectorizer is fitted on every review before the train/eval
# split that follows, so its statistics also cover the evaluation rows.
X = vectorizer.fit_transform(reviews)

In [0]:
# Split the TF-IDF matrix into a training part and an evaluation part
from sklearn.model_selection import train_test_split
import numpy as np

TEST_SPLIT = 0.2   # fraction of rows held out for evaluation
RANDOM_SEED = 42   # fixed seed so the split is repeatable

y = np.array(sentiments)

X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [0]:
# Declare and train the model
from sklearn.linear_model import LogisticRegression

# class_weight='balanced' makes training weigh each label evenly
lgs = LogisticRegression(
    class_weight='balanced',
)
lgs.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Score the fitted model on the evaluation split
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy : ", eval_accuracy)
# Other evaluation metrics to consider: accuracy, precision, recall, f1-score, auc

Accuracy :  0.8596


In [0]:
# Load the test data
test_data = pd.read_csv("test_clean.csv")

# Vectorize the test reviews with the vectorizer fitted on the training reviews
testDataVecs = vectorizer.transform(test_data["review"])

# Predicted labels for the test reviews
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [0]:
# Pair each test id with its predicted sentiment and write the submission file
ids = test_data['id'].tolist()
answer_dataset = pd.DataFrame({'id': ids, 'sentiment': test_predicted})
# quoting=3 is the value of csv.QUOTE_NONE: fields are written without quotes
answer_dataset.to_csv(
    'lgs_tfidf_answer.csv',
    index=False,
    quoting=3,
)

In [0]:
# Put the Kaggle API credentials where the kaggle CLI looks for them
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Create an API key from your own Kaggle account, then upload kaggle.json before running this cell
# chmod 600: only the owner may read or write the key file
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Submit the TF-IDF predictions to Kaggle
!kaggle competitions submit -c word2vec-nlp-tutorial -f lgs_tfidf_answer.csv -m "logistic regression with tf-idf vectorizer"

100% 276k/276k [00:03<00:00, 74.8kB/s]
Successfully submitted to Bag of Words Meets Bags of Popcorn

## Word2vec을 활용한 로지스틱 회귀 분석

In [0]:
# Reload train_clean.csv for the word2vec section
train_data = pd.read_csv("train_clean.csv")

reviews = train_data["review"].tolist()
sentiments = train_data["sentiment"].tolist()

# word2vec needs each input as a list of words, so split every review on whitespace
sentences = [review.split() for review in reviews]

In [0]:
# Hyperparameters needed for training (passed to Word2Vec in a later cell)
num_features = 300 # number of features per word vector
min_word_count = 40 # minimum frequency count for a word
num_workers = 4 # number of processes
context = 10 # context window size
downsampling = 1e-3 # downsampling rate

In [0]:
# Install gensim, which provides the word2vec implementation imported in a later cell.
# NOTE(review): the version is not pinned, so a fresh run installs whatever release is current — confirm it matches the API used below.
!pip install gensim

In [0]:
# Turn on INFO-level logging so progress can be followed while training
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [0]:
import inspect

from gensim.models import word2vec

# gensim 3.x names the embedding-size keyword `size`; gensim 4 renamed it to
# `vector_size`. Pick whichever the installed version accepts so the cell
# runs on both instead of raising TypeError on a fresh install.
_w2v_init_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
_size_keyword = "vector_size" if "vector_size" in _w2v_init_params else "size"

# Train word vectors on the tokenized reviews using the hyperparameters above
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    **{_size_keyword: num_features},
)

In [0]:
# Save the trained model to disk under a name that records its hyperparameters
model_name = "300features_40minwords_10context"
model.save(model_name)

2019-10-21 07:23:30,950 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2019-10-21 07:23:30,952 : INFO : not storing attribute vectors_norm
2019-10-21 07:23:30,958 : INFO : not storing attribute cum_table
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-10-21 07:23:31,264 : INFO : saved 300features_40minwords_10context


In [0]:
# Average the word vectors of one review so that every review yields a vector of the same length
def get_features(words,model,num_features):
  """Return the mean word vector of the in-vocabulary tokens in ``words``.

  Args:
    words: iterable of tokens for a single review.
    model: trained word2vec model; only ``model.wv`` is used, for the
      vocabulary membership test and the vector lookup.
    num_features: length of each word vector, and therefore of the result.

  Returns:
    A 1-D NumPy array of length ``num_features``. It is all zeros when no
    token of ``words`` is in the vocabulary (previously this case divided by
    zero and produced a NaN vector).
  """
  # Query the keyed vectors directly: `model[w]` is deprecated, and testing
  # membership here avoids rebuilding a vocabulary set on every call.
  word_vectors = model.wv

  # Running sum of the vectors of the known words
  feature_vector = np.zeros((num_features,),dtype=np.float32)
  num_words = 0

  for w in words:
    if w in word_vectors:
      num_words += 1
      feature_vector = np.add(feature_vector,word_vectors[w])

  # Divide only when at least one word contributed, so no 0/0 is evaluated
  if num_words:
    feature_vector = np.divide(feature_vector, num_words)
  return feature_vector

In [0]:
def get_dataset(reviews,model,num_features):
  """Build the feature matrix for a collection of tokenized reviews.

  Args:
    reviews: iterable of reviews, each one a list of tokens.
    model: trained word2vec model, passed through to ``get_features``.
    num_features: length of each word vector, passed through to ``get_features``.

  Returns:
    A 2-D NumPy array with one row per review, each row being the result of
    ``get_features`` for that review. For empty ``reviews`` an array of shape
    ``(0, num_features)`` is returned instead of letting ``np.stack`` raise.
  """
  dataset = [get_features(s,model,num_features) for s in reviews]
  if not dataset:
    # np.stack refuses an empty list; return an empty matrix of the right width
    return np.zeros((0, num_features), dtype=np.float32)

  reviewFeatureVecs = np.stack(dataset)
  return reviewFeatureVecs

In [0]:
# Build the averaged word2vec feature matrix for the training sentences.
# NOTE(review): despite its name, test_data_vecs holds the *training* features here;
# the name is kept because the split cell that follows reads it.
test_data_vecs = get_dataset(reviews=sentences, model=model, num_features=num_features)

  del sys.path[0]


In [0]:
# Split the word2vec features into a training part and an evaluation part
from sklearn.model_selection import train_test_split

TEST_SPLIT = 0.2   # fraction of rows held out for evaluation
RANDOM_SEED = 42   # fixed seed so the split is repeatable

y = np.array(sentiments)
X = test_data_vecs

X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [0]:
# Declare and train the model on the averaged word2vec features
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(
    class_weight='balanced',
)
lgs.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Performance on the evaluation split
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy : ", eval_accuracy)

Accuracy :  0.8626


In [0]:
# Data submission: load the test data that predictions will be made for
test_data = pd.read_csv("test_clean.csv")

In [0]:
# Collect the test ids and split every test review into a list of words
test_ids = test_data["id"].tolist()
test_reviews = test_data["review"].tolist()
test_sentences = [review.split() for review in test_reviews]

In [0]:
# Averaged word2vec feature matrix for the test sentences
test_data_vecs = get_dataset(reviews=test_sentences, model=model, num_features=num_features)

  del sys.path[0]


In [0]:
# Predict the sentiment of every test review and write the submission file
test_predicted = lgs.predict(test_data_vecs)
# Use test_ids, built in this section from test_data["id"]; the previous `ids`
# only existed if the TF-IDF section had been run earlier in the same kernel.
answer_dataset = pd.DataFrame({"id":test_ids,"sentiment":test_predicted})
# quoting=3 is the value of csv.QUOTE_NONE: fields are written without quotes
answer_dataset.to_csv("lgs_answer.csv",index=False,quoting=3)

In [79]:
!kaggle competitions submit -c word2vec-nlp-tutorial -f lgs_answer.csv -m "logistic regression with word2ved"

100% 276k/276k [00:03<00:00, 75.8kB/s]
Successfully submitted to Bag of Words Meets Bags of Popcorn