IMDB 데이터셋

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import sys
sys.path.append('C:/Users/User/Desktop/IMDB')
from data_preprocessing import *

In [2]:
TRAIN_DATA_PATH = "C:/Users/user/Desktop/bilm-tf-master/textdataset/IMDB/aclImdb/train"
TEST_DATA_PATH = "C:/Users/user/Desktop/bilm-tf-master/textdataset/IMDB/aclImdb/test"

def read_text_file(path):
    labels = ['neg','pos']
    if os.path.exists(path):
        text=[]
        text_label =[]
        for directory_name in os.listdir(path):
            if directory_name in labels:
                label_index = labels.index(directory_name)
                data_path = os.path.join(path,directory_name)
                for file in os.listdir(data_path):
                    with open(os.path.join(data_path,file),'r', encoding='utf-8') as f:
                        text.append(f.read())
                        text_label.append(label_index)
        return pd.DataFrame(text,columns =['texts']),pd.DataFrame(text_label,columns =['label'])
    
x_train,y_train = read_text_file(TRAIN_DATA_PATH) 
x_test,y_test = read_text_file(TEST_DATA_PATH) 

# 전처리

In [3]:
train = pd.concat([x_train, y_train], axis=1)
test = pd.concat([x_test, y_test], axis=1)

train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)

In [4]:
train[['texts']] = train[['texts']].applymap(lambda x:remove_punctuation(x))
train[['texts']] = train[['texts']].applymap(lambda x:x.lower())
test[['texts']] = test[['texts']].applymap(lambda x:remove_punctuation(x))
test[['texts']] = test[['texts']].applymap(lambda x:x.lower())

In [5]:
X_train = train.drop(columns=['label'])
X_test = test.drop(columns=['label'])
y_train = train['label']
y_test = test['label']

X_train = X_train['texts'].apply(clean_text)
X_test = X_test['texts'].apply(clean_text)

pattern = '[^a-z ]'
Clean_X_train=[]
Clean_X_test=[]

for sen in X_train:
    Clean_X_train.append(re.sub(pattern, '', str(sen)))
    
for sen in X_test:
    Clean_X_test.append(re.sub(pattern, '', str(sen)))

In [6]:
y_train=list(y_train)
y_test=list(y_test)

In [7]:
train_df = pd.DataFrame({'X_train': Clean_X_train, 'y_train': y_train})
test_df = pd.DataFrame({'X_test': Clean_X_test, 'y_test': y_test})

In [8]:
# 레이블 값에 따라 데이터프레임을 그룹화하고 각 그룹에서 8000개의 샘플을 랜덤하게 추출
train_df = train_df.groupby('y_train').apply(lambda x: x.sample(n=8000, random_state=42))

# 레이블 값에 따라 데이터프레임을 그룹화하고 각 그룹에서 2000개의 샘플을 랜덤하게 추출
test_df = test_df.groupby('y_test').apply(lambda x: x.sample(n=2000, random_state=42))

# 인덱스를 재설정합니다. drop=True 옵션을 사용하여 기존 인덱스를 제거합니다.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [9]:
x_train = train_df['X_train'].tolist()
y_train = train_df['y_train'].tolist()
x_test = test_df['X_test'].tolist()
y_test = test_df['y_test'].tolist()

In [10]:
to_txt=x_train+x_test
y=y_train+y_test

In [12]:
#불용어 불러오기
with open('C:/Users/user/Desktop/english.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file]

In [13]:
#단어 10000개 선별
vect = CountVectorizer(stop_words=stopwords)
X_dtm = vect.fit_transform(to_txt)
X_dtm = X_dtm.toarray()
X_new = SelectKBest(chi2, k=10000).fit(X_dtm, y)
TorF = X_new.get_support()
TorF
import numpy as np
word_view=np.array(vect.get_feature_names_out())
sw=word_view[TorF]



In [14]:
#훈련데이터에서 선별한 10000개 단어 이외 단어는 <ukn>으로 변경
train_transe=[]
for i in range(len(x_train)):
    a=x_train[i].split()
    for j in range(len(a)):
        if a[j] not in sw:
            a[j] = '<ukn>'
    train_transe.append(' '.join(a))

In [15]:
#테스트데이터에서 선별한 10000개 단어 이외 단어는 <ukn>으로 변경
test_transe=[]
for i in range(len(x_test)):
    a=x_test[i].split()
    for j in range(len(a)):
        if a[j] not in sw:
            a[j] = '<ukn>'
    test_transe.append(' '.join(a))

In [16]:
# DTM 확인
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(train_transe)
# print(x_train.shape)

# 테스트데이터
x_test_dtm = dtmvector.transform(test_transe) #테스트 데이터를 DTM으로 변환

In [17]:
# TF-IDF Matrix확인
tfidf_transformer = TfidfTransformer()
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)
print(tfidfv.shape)

# 테스트데이터
tfidfv_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환

(16000, 9781)


# 전통적인 머신러닝 모델로 학습

In [19]:
#SVM 모델
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'penalty': ['l1', 'l2']}

grid = GridSearchCV(LinearSVC(max_iter=500, dual=False), param_grid, refit=True, verbose=3)

grid.fit(tfidfv, y_train)
print(grid.best_params_)

predicted = grid.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .................C=0.1, penalty=l1;, score=0.767 total time=   0.0s
[CV 2/5] END .................C=0.1, penalty=l1;, score=0.760 total time=   0.0s
[CV 3/5] END .................C=0.1, penalty=l1;, score=0.761 total time=   0.0s




[CV 4/5] END .................C=0.1, penalty=l1;, score=0.757 total time=   0.0s
[CV 5/5] END .................C=0.1, penalty=l1;, score=0.771 total time=   0.0s
[CV 1/5] END .................C=0.1, penalty=l2;, score=0.837 total time=   0.0s
[CV 2/5] END .................C=0.1, penalty=l2;, score=0.835 total time=   0.0s
[CV 3/5] END .................C=0.1, penalty=l2;, score=0.843 total time=   0.0s
[CV 4/5] END .................C=0.1, penalty=l2;, score=0.843 total time=   0.0s
[CV 5/5] END .................C=0.1, penalty=l2;, score=0.847 total time=   0.0s




[CV 1/5] END ...................C=1, penalty=l1;, score=0.853 total time=   0.3s




[CV 2/5] END ...................C=1, penalty=l1;, score=0.848 total time=   0.3s




[CV 3/5] END ...................C=1, penalty=l1;, score=0.854 total time=   0.3s




[CV 4/5] END ...................C=1, penalty=l1;, score=0.843 total time=   0.3s




[CV 5/5] END ...................C=1, penalty=l1;, score=0.854 total time=   0.3s
[CV 1/5] END ...................C=1, penalty=l2;, score=0.873 total time=   0.0s
[CV 2/5] END ...................C=1, penalty=l2;, score=0.873 total time=   0.0s
[CV 3/5] END ...................C=1, penalty=l2;, score=0.882 total time=   0.0s
[CV 4/5] END ...................C=1, penalty=l2;, score=0.876 total time=   0.0s
[CV 5/5] END ...................C=1, penalty=l2;, score=0.879 total time=   0.0s




[CV 1/5] END ..................C=10, penalty=l1;, score=0.865 total time=   1.3s




[CV 2/5] END ..................C=10, penalty=l1;, score=0.859 total time=   1.3s




[CV 3/5] END ..................C=10, penalty=l1;, score=0.864 total time=   1.3s




[CV 4/5] END ..................C=10, penalty=l1;, score=0.853 total time=   1.3s




[CV 5/5] END ..................C=10, penalty=l1;, score=0.851 total time=   1.3s
[CV 1/5] END ..................C=10, penalty=l2;, score=0.886 total time=   0.0s
[CV 2/5] END ..................C=10, penalty=l2;, score=0.881 total time=   0.0s
[CV 3/5] END ..................C=10, penalty=l2;, score=0.891 total time=   0.0s
[CV 4/5] END ..................C=10, penalty=l2;, score=0.883 total time=   0.0s
[CV 5/5] END ..................C=10, penalty=l2;, score=0.885 total time=   0.0s




[CV 1/5] END .................C=100, penalty=l1;, score=0.850 total time=   1.1s




[CV 2/5] END .................C=100, penalty=l1;, score=0.836 total time=   1.1s




[CV 3/5] END .................C=100, penalty=l1;, score=0.844 total time=   1.1s




[CV 4/5] END .................C=100, penalty=l1;, score=0.831 total time=   1.1s




[CV 5/5] END .................C=100, penalty=l1;, score=0.837 total time=   1.2s
[CV 1/5] END .................C=100, penalty=l2;, score=0.868 total time=   0.0s
[CV 2/5] END .................C=100, penalty=l2;, score=0.865 total time=   0.0s
[CV 3/5] END .................C=100, penalty=l2;, score=0.870 total time=   0.1s
[CV 4/5] END .................C=100, penalty=l2;, score=0.863 total time=   0.1s
[CV 5/5] END .................C=100, penalty=l2;, score=0.870 total time=   0.0s




[CV 1/5] END ................C=1000, penalty=l1;, score=0.847 total time=   1.2s




[CV 2/5] END ................C=1000, penalty=l1;, score=0.837 total time=   1.2s




[CV 3/5] END ................C=1000, penalty=l1;, score=0.843 total time=   1.2s




[CV 4/5] END ................C=1000, penalty=l1;, score=0.835 total time=   1.2s




[CV 5/5] END ................C=1000, penalty=l1;, score=0.832 total time=   1.2s




[CV 1/5] END ................C=1000, penalty=l2;, score=0.851 total time=   0.9s
[CV 2/5] END ................C=1000, penalty=l2;, score=0.843 total time=   0.7s
[CV 3/5] END ................C=1000, penalty=l2;, score=0.845 total time=   0.8s
[CV 4/5] END ................C=1000, penalty=l2;, score=0.839 total time=   0.8s
[CV 5/5] END ................C=1000, penalty=l2;, score=0.842 total time=   0.9s
{'C': 10, 'penalty': 'l2'}
정확도: 0.872
F1 스코어: 0.8711625566180171
Recall 값: 0.8655




In [21]:
#나이브 베이즈 모델
param_grid = {'alpha': [0.01, 0.1, 0.5, 1, 10, 100],
              'fit_prior': [True, False]}

grid = GridSearchCV(MultinomialNB(), param_grid, refit=True, verbose=3)

grid.fit(tfidfv, y_train)
print(grid.best_params_)

predicted = grid.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ........alpha=0.01, fit_prior=True;, score=0.887 total time=   0.0s
[CV 2/5] END ........alpha=0.01, fit_prior=True;, score=0.880 total time=   0.0s
[CV 3/5] END ........alpha=0.01, fit_prior=True;, score=0.896 total time=   0.0s
[CV 4/5] END ........alpha=0.01, fit_prior=True;, score=0.897 total time=   0.0s
[CV 5/5] END ........alpha=0.01, fit_prior=True;, score=0.888 total time=   0.0s
[CV 1/5] END .......alpha=0.01, fit_prior=False;, score=0.887 total time=   0.0s
[CV 2/5] END .......alpha=0.01, fit_prior=False;, score=0.880 total time=   0.0s
[CV 3/5] END .......alpha=0.01, fit_prior=False;, score=0.896 total time=   0.0s
[CV 4/5] END .......alpha=0.01, fit_prior=False;, score=0.897 total time=   0.0s
[CV 5/5] END .......alpha=0.01, fit_prior=False;, score=0.888 total time=   0.0s
[CV 1/5] END .........alpha=0.1, fit_prior=True;, score=0.878 total time=   0.0s
[CV 2/5] END .........alpha=0.1, fit_prior=True;

In [23]:
#logistic regression 모델
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'penalty': ['l1', 'l2']}

grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose=3)

grid.fit(tfidfv, y_train)
print(grid.best_params_)

predicted = grid.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END .................C=0.1, penalty=l2;, score=0.795 total time=   0.0s
[CV 2/5] END .................C=0.1, penalty=l2;, score=0.797 total time=   0.0s
[CV 3/5] END .................C=0.1, penalty=l2;, score=0.801 total time=   0.0s
[CV 4/5] END .................C=0.1, penalty=l2;, score=0.808 total time=   0.0s
[CV 5/5] END .................C=0.1, penalty=l2;, score=0.800 total time=   0.0s
[CV 1/5] END .....................C=1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .....................C=1, penalty=l

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END ..................C=10, penalty=l2;, score=0.874 total time=   0.0s
[CV 3/5] END ..................C=10, penalty=l2;, score=0.882 total time=   0.0s
[CV 4/5] END ..................C=10, penalty=l2;, score=0.874 total time=   0.0s
[CV 5/5] END ..................C=10, penalty=l2;, score=0.877 total time=   0.1s
[CV 1/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ...................C=100, penalty=l1;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END .................C=100, penalty=l2;, score=0.887 total time=   0.1s
[CV 2/5] END .................C=100, penalty=l2;, score=0.881 total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END .................C=100, penalty=l2;, score=0.892 total time=   0.1s
[CV 4/5] END .................C=100, penalty=l2;, score=0.882 total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .................C=100, penalty=l2;, score=0.883 total time=   0.1s
[CV 1/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ..................C=1000, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ................C=1000, penalty=l2;, score=0.873 total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END ................C=1000, penalty=l2;, score=0.872 total time=   0.1s
[CV 3/5] END ................C=1000, penalty=l2;, score=0.877 total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the fai

[CV 4/5] END ................C=1000, penalty=l2;, score=0.874 total time=   0.1s
[CV 5/5] END ................C=1000, penalty=l2;, score=0.877 total time=   0.1s
{'C': 100, 'penalty': 'l2'}
정확도: 0.87275
F1 스코어: 0.8720784116612215
Recall 값: 0.8675


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
