In [24]:
import os
import numpy as np
import pandas as pd

import warnings
# Silence every warning from here on: the 'ignore' action is installed with no
# category or module restriction, so deprecation and convergence warnings are hidden too.
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [25]:
# Load the train and test tables from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./train/train.csv', './test/test.csv')
)

# Handle missing values: fill absent descriptions with the placeholder "none"
# and keep only the underlying value arrays for vectorization.
train_desc, test_desc = (
    frame['Description'].fillna("none").values
    for frame in (train, test)
)

In [37]:
# Set up a TF-IDF vectorizer with its default options.
tfv = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training descriptions only,
# then turn those same descriptions into a document-term matrix.
X = tfv.fit(list(train_desc)).transform(train_desc)

# The test descriptions are mapped onto the vocabulary fitted above.
X_test = tfv.transform(test_desc)

# Report (documents, terms) for each split; shape is unpacked as rows, columns.
print("train.. 문장 : {:>6,}, 단어 : {:,}" .format(*X.shape))
print("test..  문장 : {:>6,}, 단어 : {:,}" .format(*X_test.shape))

train.. 문장 : 14,993, 단어 : 21,209
test..  문장 :  3,948, 단어 : 21,209


In [38]:
# Truncated SVD configuration.

# Number of components kept from the TF-IDF matrix.
TruncatedSVD_k = 10
# Fixed seed: TruncatedSVD uses a randomized solver by default, so without a
# seed the components (and every feature derived from them) vary between runs.
TruncatedSVD_seed = 42
svd = TruncatedSVD(n_components=TruncatedSVD_k, random_state=TruncatedSVD_seed)

# Fit on the training matrix only.
svd.fit(X)

# Project both splits onto the fitted components.
# NOTE: X and X_test are overwritten here, so this cell consumes its own
# inputs; re-run the TF-IDF cell first before executing this cell again.
X = svd.transform(X)
X_test = svd.transform(X_test)

print(X.shape)
print(X_test.shape)
# Share of variance captured by the kept components, as a percentage.
print("설명 가능한  분산 : {:.2f}%" .format(svd.explained_variance_ratio_.sum()*100))

(14993, 10)
(3948, 10)
설명 가능한  분산 : 10.25%


In [39]:
# Attach the SVD features to the original tables as svd_0 .. svd_{k-1}.
columns_name = ['svd_{}'.format(i) for i in range(TruncatedSVD_k)]

# pd.concat(axis=1) aligns rows by index label, so each feature frame is built
# on the index of the table it joins; a fresh RangeIndex would silently produce
# NaN-padded rows if the target table ever carried a different index.
X = pd.DataFrame(X, columns=columns_name, index=train.index)
# Drop svd_* columns left over from an earlier run of this cell so that
# re-running it does not append a duplicate set of feature columns.
train = pd.concat((train.drop(columns=columns_name, errors='ignore'), X), axis=1)

X_test = pd.DataFrame(X_test, columns=columns_name, index=test.index)
test = pd.concat((test.drop(columns=columns_name, errors='ignore'), X_test), axis=1)

print("train:", train.shape)
print("test:", test.shape)

train: (14993, 34)
test: (3948, 33)
