In [23]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
from time import time
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, Embedding, Dense
from keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from string import punctuation
from sklearn.model_selection import KFold
from pathlib import Path
import os
from matplotlib import rcParams, pyplot as plt
import warnings 
warnings.filterwarnings(action='ignore')
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import Sequential, Model, Input
# from attention import Attention

In [24]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [25]:
data_dir = Path('./glove.6B')
feature_dir = Path('./glove.6B/feature')
val_dir = Path('./glove.6B/val')
tst_dir = Path('./glove.6B/tst')
sub_dir = Path('./glove.6B/sub')
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    os.makedirs(d, exist_ok=True)
glove_file = data_dir / 'glove.6B.100d.txt'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [26]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
else:
    print('No GPU detected')

No GPU detected


In [27]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

In [28]:
algo_name = 'lstm'
feature_name = 'glove'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [29]:
embeddings_index = {}
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
        
print(f'Found {len(embeddings_index)} word vectors.')

Found 400000 word vectors.


# 내 데이터로 돌려보자

In [30]:
trn2 = pd.read_csv('./mydata2.csv', encoding = 'utf-8')
trn2.rename(columns={'Unnamed: 0': 'index'})

Unnamed: 0,index,text,author
0,0,"almost choking. much, much wanted say, strange...",3
1,1,"“your sister asked it, suppose?”",2
2,2,"engaged one day walked, perusing jane’s last l...",1
3,3,"captain porch, keeping carefully way treachero...",4
4,4,"“have mercy, gentlemen!” odin flung hands. “do...",3
...,...,...,...
109753,109753,"“is you, mr. smith?” odin whispered. “i hardly...",2
109754,109754,"told plan captain, us settled details accompli...",4
109755,109755,"""your sincere well-wisher, friend, sister, ""lu...",1
109756,109756,“then wanted lend money?”,3


In [31]:
tst2 = pd.read_csv('./test_x.csv', encoding = 'utf-8')
tst2

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


In [32]:
trn_value = trn2['text'].values
tst_value = tst2['text'].values
y = trn2['author'].values
print(trn_value.shape, tst_value.shape, y.shape)

(109758,) (19617,) (109758,)


In [33]:
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=500)
text_ds = tf.data.Dataset.from_tensor_slices(trn_value).batch(128)
vectorizer.adapt(text_ds)

In [34]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'odin', 'said', 'one']

In [35]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [36]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print(f"Converted {hits} words ({misses} misses)")

Converted 15846 words (4154 misses)


In [37]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

In [38]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [42]:
max_len = max(len(i) for i in trn_value)
print(max_len)

1577


In [43]:
from tensorflow import keras
def get_model():
    int_sequences_input = Input(shape=(1,), dtype=tf.string)
    vectorized_sequences = vectorizer(int_sequences_input)
    embedded_sequences = embedding_layer(vectorized_sequences)
    x = Bidirectional(LSTM(64, return_sequences=True))(embedded_sequences)
    x = Bidirectional(LSTM(64))(x)
#    atteintion = Attention()
#    x = x * attention
    preds = Dense(n_class, activation="softmax")(x)
    model = Model(int_sequences_input, preds)
    
    
    # compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate=.001),
                 metrics = ['accuracy'])
    return model

In [44]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model, to_categorical

In [None]:
p_val = np.zeros((trn_value.shape[0], n_class))
p_tst = np.zeros((tst_value.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn_value, y), 1):
    print(f'training model for CV #{i}')
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)
    
    clf = get_model() 
    clf.fit(trn_value[i_trn], 
            to_categorical(y[i_trn]),
            validation_data=(trn_value[i_val], to_categorical(y[i_val])),
            epochs=20,
            batch_size=512,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(trn_value[i_val])
    p_tst += clf.predict(tst_value) / n_fold

training model for CV #1
Epoch 1/20
Epoch 2/20
Epoch 3/20

In [None]:
from sklearn.metrics import accuracy_score, log_loss

In [None]:
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

In [None]:
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

In [None]:
clf.summary()

In [None]:
import pydot

In [None]:
plot_model(clf)

In [None]:
sub = pd.read_csv('./sample_submission.csv', encoding = 'utf-8')

In [10]:
sub2 = pd.read_csv('./mydata_leemodel6.csv', encoding = 'utf-8')

In [11]:
sub2

Unnamed: 0,index,0,1,2,3,4
0,0,0.0104,4.3737e-01,3.6441e-02,5.1469e-01,1.0776e-03
1,1,0.0930,2.1970e-01,5.3133e-02,3.5842e-01,2.7576e-01
2,2,0.9887,1.0846e-02,1.3740e-04,1.0597e-04,1.8324e-04
3,3,0.0608,6.8418e-04,7.8870e-01,5.0629e-03,1.4474e-01
4,4,0.9939,2.5414e-04,1.6695e-04,4.8308e-03,8.1659e-04
...,...,...,...,...,...,...
19612,19612,0.0011,9.9893e-01,2.4620e-06,4.0554e-06,1.0388e-06
19613,19613,0.1227,2.3310e-03,7.8961e-03,1.2329e-03,8.6580e-01
19614,19614,0.0004,9.9953e-01,2.3427e-06,3.0668e-05,6.0931e-07
19615,19615,0.0001,9.9986e-01,8.3825e-06,3.4599e-06,2.5793e-06


In [14]:
sub

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
19612,0,0,0,0,0
19613,0,0,0,0,0
19614,0,0,0,0,0
19615,0,0,0,0,0


In [15]:
sub2[sub.columns] = p_tst_file

In [16]:
p_tst_file

PosixPath('glove.6B/tst/lstm_glove.tst.csv')

In [17]:
sub2

Unnamed: 0,index,0,1,2,3,4
0,0,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
1,1,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
2,2,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
3,3,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
4,4,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
...,...,...,...,...,...,...
19612,19612,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
19613,19613,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
19614,19614,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv
19615,19615,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv,glove.6B/tst/lstm_glove.tst.csv


In [None]:
sub[sub.columns] = p_tst

In [113]:
sub.to_csv('mydata_pretrained2.csv', index = False, encoding = 'utf-8')

# 여기까지