In [31]:
import os
current_path = os.getcwd()
print(current_path)

# Connect Google Drive to Colab so the data can be loaded from it (mounting).
# After checking the path printed above:
# 1. Create the data_in folder
# 2. Download the Kaggle dataset https://www.kaggle.com/competitions/word2vec-nlp-tutorial/data and upload it to Drive
# 3. Point DATA_IN_PATH at that folder

from google.colab import drive
drive.mount('/content/drive')

/content
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import zipfile
# Folder on the mounted Drive that holds the downloaded Kaggle archives;
# the extracted .tsv files are written next to them.
DATA_IN_PATH = '/content/drive/MyDrive/DeepLearning_NLP/data_in/'
file_list = ['labeledTrainData.tsv.zip', 'unlabeledTrainData.tsv.zip', 'testData.tsv.zip']

for file in file_list:
  # The context manager closes the archive even if extraction raises.
  with zipfile.ZipFile(DATA_IN_PATH + file, 'r') as zipRef:
    zipRef.extractall(DATA_IN_PATH)

In [33]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
# sentiment -> emotion (the label column; the rows printed below show values 0 and 1)

# Report the size of every extracted .tsv file in the data folder, in MB (bytes / 1,000,000).
print('File Size: ')
for file in os.listdir(DATA_IN_PATH):
  # Keep names that contain 'tsv' but not 'zip', i.e. skip the original archives.
  if 'tsv' in file and 'zip' not in file:
    print(file.ljust(30) + str(round(os.path.getsize(DATA_IN_PATH + file) / 1000000, 2)) + 'MB')
  
train_data = pd.read_csv(DATA_IN_PATH + 'labeledTrainData.tsv', header = 0, delimiter ='\t', quoting = 3) # quoting=3 is csv.QUOTE_NONE: double quotes are read as ordinary characters (they stay in the values, they are not stripped)
print(train_data.head()) 
print()
print('The number of entire training data: {}'.format(len(train_data)))
train_length = train_data['review'].apply(len)
train_length.head() # length (in characters) of each review

File Size: 
labeledTrainData.tsv          33.56MB
unlabeledTrainData.tsv        67.28MB
testData.tsv                  32.72MB
         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ...

The number of entire training data: 25000


0    2304
1     948
2    2451
3    2247
4    2233
Name: review, dtype: int64

In [34]:
# Preprocessing

import re 
import json 
import pandas as pd 
import numpy as np 
import nltk 
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer

# Re-declare the data folder and reload the labelled training set
# (same path and read_csv arguments as the earlier load).
DATA_IN_PATH = '/content/drive/MyDrive/DeepLearning_NLP/data_in/'
train_data = pd.read_csv(DATA_IN_PATH + 'labeledTrainData.tsv', header = 0, delimiter ='\t', quoting = 3)

review = train_data['review'][0] # take the first review
print(review)

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [35]:
# Preprocessing of a single review, step by step
review_text = BeautifulSoup(review, "html5lib").get_text() # strip the HTML tags and keep only the text
review_text = re.sub("[^a-zA-Z]", " ", review_text) # replace every character that is not an English letter with a space (regex substitution; ^ inside [] means "everything except")
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) # build the set of English stopwords
review_text =review_text.lower() # lowercase
words = review_text.split() # split the lowercased text into a list of words
words = [w for w in words if not w in stop_words] # keep only the words that are not stopwords

print(words)
clean_review = ' '.join(words) # join the word list back into a single string
print(clean_review)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'e

In [36]:
# The same review preprocessing, wrapped in a function

def preprocessing(review, remove_stopwords = False):
  """Clean one raw review and return it as space-separated lowercase words.

  Steps: strip HTML markup with BeautifulSoup, replace every character that
  is not an ASCII letter with a space, lowercase, split on whitespace,
  optionally drop NLTK English stopwords, and re-join with single spaces.

  Args:
    review: Raw review text (may contain HTML tags).
    remove_stopwords: When True, words found in NLTK's English stopword
      list are removed. Defaults to False.

  Returns:
    The cleaned review as a single string.
  """
  review_text = BeautifulSoup(review, "html5lib").get_text()
  review_text = re.sub("[^a-zA-Z]", " ", review_text)
  # Tokenise before branching: both paths need the word list, and leaving
  # it inside the stopword branch makes remove_stopwords=False fail with
  # an unbound `words`.
  words = review_text.lower().split()
  if remove_stopwords:
    stops = set(stopwords.words("english"))
    words = [w for w in words if w not in stops]
  return ' '.join(words)


# Clean every training review with stopwords removed. A comprehension keeps
# the loop variable local, so the notebook-level `review` (the first review,
# used by the single-review preprocessing cell) is not overwritten.
clean_train_reviews = [
  preprocessing(raw_review, remove_stopwords = True)
  for raw_review in train_data['review']
]
# Pair each cleaned text with its sentiment label.
clean_train_df = pd.DataFrame({'review': clean_train_reviews,
                               'sentiment': train_data['sentiment']})


In [None]:
# Preview the first rows of the cleaned reviews alongside their labels.
clean_train_df.head()

Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1


In [None]:
# Show the English stopword set; `stop_words` is defined in the single-review
# preprocessing cell, so that cell must have been run first.
print(stop_words)

{'he', 'were', 'or', 'a', 'only', 'all', 'no', 'did', 'had', 've', 'hasn', 'between', 'than', 'be', "doesn't", 'does', 'where', 'why', 'out', 'not', 'before', "you'd", 'as', 'mustn', "mustn't", 'above', 'both', 'who', "hadn't", 'herself', "shouldn't", 'they', "that'll", 'my', 'most', 'has', 'wasn', "you'll", 'after', 'you', 'myself', 'doing', "isn't", 'our', 'again', 'few', 'm', 'o', 'his', 'do', 'against', 'having', "you're", 'shan', 'and', 'any', 'that', 'by', 'ma', 'very', 'too', 'each', "didn't", 'once', 'hers', 'over', "won't", 'she', "couldn't", 'didn', "wouldn't", 'into', 'yourself', 'am', 'doesn', 'mightn', 'was', 'been', 'haven', 'there', 'yourselves', 'other', 'now', 'ain', 'll', 'ourselves', 'on', 'them', 'this', "it's", 'will', 'just', 'themselves', 'hadn', 'below', 'weren', 'needn', 'theirs', 'can', 'in', "aren't", "shan't", 'down', 'which', 'further', 'until', 'd', 'have', 'yours', 'about', 'because', 'y', 'its', 'how', "she's", "wasn't", 'these', 'with', 'couldn', 'to', 

In [37]:
# Alternative to the TF-IDF vectorizer: integer-sequence inputs. If you use TF-IDF features, this cell is not needed.
# Fit a Keras Tokenizer on the cleaned reviews and convert each review
# into a sequence of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_reviews)
text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)

MAX_SEQUENCE_LENGTH = 174 # arbitrary setting
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries; padding='post' adds the padding at the end.
train_inputs = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print('Train Data:', train_inputs.shape)

# Labels as a NumPy array, in the same row order as train_data.
train_labels = np.array(train_data['sentiment'])
print('Label:', train_labels.shape)

Train Data: (25000, 174)
Label: (25000,)


In [38]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
RANDOM_SEED = 42
TEST_SPLIT = 0.2

# NOTE(review): this uses the raw `review` column of train_data, not clean_train_reviews — confirm that is intended.
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])
# Character-level TF-IDF over 1- to 3-character n-grams, capped at 5000 features.
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer='char', sublinear_tf=True, ngram_range=(1,3), max_features=5000) # the arguments can be left empty (defaults), but max_features is usually set
# NOTE(review): the vectorizer is fitted on all reviews before the train/test split below,
# so its vocabulary and IDF weights are computed from the test rows as well.
X = vectorizer.fit_transform(reviews) # used as the input to the neural network
y = np.array(sentiments)

# Hold out TEST_SPLIT of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=TEST_SPLIT, random_state =RANDOM_SEED)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Logistic-regression baseline on the TF-IDF features.
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train) # predicted = lgs.predict(X_test)
print("Accuracy: %f" % lgs.score(X_test, y_test))

(20000, 5000) (5000, 5000) (20000,) (5000,)
Accuracy: 0.869600


In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Convert the TF-IDF matrices to dense arrays before passing them to the Keras model.
X_train1 = X_train.toarray()
X_test1 = X_test.toarray()
# Model, Cost, Train
model = Sequential()
model.add(Dense(5000, activation='relu')) # hidden layer
model.add(Dense(1, activation='sigmoid')) # single sigmoid output unit
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy']) # sgd could also be used instead of adam
# NOTE(review): 200 epochs with no validation data or early stopping — confirm this is intended.
model.fit(X_train1, y_train, epochs=200, verbose =1)
# Testing
_, accuracy = model.evaluate(X_test1, y_test)
print("Accuracy:", accuracy)
model.summary()