In [None]:
!pip install transformers
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import transformers
from transformers import RobertaTokenizer, TFRobertaModel
transformers.logging.set_verbosity_error()
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

###Data Preprocessing


In [2]:
df_train = pd.read_csv("https://raw.githubusercontent.com/thomascourtney/nlp_final_project/main/data/Poem_classification%20-%20train_data.csv")
df_test = pd.read_csv("https://raw.githubusercontent.com/thomascourtney/nlp_final_project/main/data/Poem_classification%20-%20test_data.csv")

In [3]:
df_train = df_train.dropna(axis=0)
df_train.isnull().sum()

Genre    0
Poem     0
dtype: int64

In [4]:
df_test.isnull().sum()

Genre    0
Poem     0
dtype: int64

In [5]:
display(df_train)
display(df_test)

Unnamed: 0,Genre,Poem
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
5,Music,"for Bob Marley, Bavaria, November 1980 Here i..."
...,...,...
836,Environment,Why make so much of fragmentary blue In here a...
837,Environment,"Woman, I wish I didn't know your name. What co..."
838,Environment,"Yonder to the kiosk, beside the creek, Paddle ..."
839,Environment,You come to fetch me from my work to-night Whe...


Unnamed: 0,Genre,Poem
0,Music,A woman walks by the bench I’m sitting onwith ...
1,Music,"Because I am a boy, the untouchability of beau..."
2,Music,"Because today we did not leave this world,We n..."
3,Music,"Big Bend has been here, been here. Shouldn’t i..."
4,Music,"I put shells there, along the lip of the road...."
...,...,...
145,Environment,"To pick a tulip from the garden, the red one. ..."
146,Environment,We are as clouds that veil the midnight moon; ...
147,Environment,"When pulled, the spider web took another form...."
148,Environment,Whose woods these are I think I know. His hous...


In [6]:
df_train.Genre.value_counts()


Music          238
Death          231
Environment    227
Affection      141
Name: Genre, dtype: int64

In [7]:
df_test.Genre.value_counts()

Affection      100
Environment     25
Death           13
Music           12
Name: Genre, dtype: int64

In [9]:
X_train = df_train['Poem']
X_train

1                    In the thick brushthey spend the...
2         Storms are generous.                       ...
3       —After Ana Mendieta Did you carry around the ...
4       for Aja Sherrard at 20The portent may itself ...
5       for Bob Marley, Bavaria, November 1980 Here i...
                             ...                        
836    Why make so much of fragmentary blue In here a...
837    Woman, I wish I didn't know your name. What co...
838    Yonder to the kiosk, beside the creek, Paddle ...
839    You come to fetch me from my work to-night Whe...
840    You see them through water and glass, (both li...
Name: Poem, Length: 837, dtype: object

In [11]:
y_train = df_train['Genre']
y_train

1            Music
2            Music
3            Music
4            Music
5            Music
          ...     
836    Environment
837    Environment
838    Environment
839    Environment
840    Environment
Name: Genre, Length: 837, dtype: object

In [13]:
X_test = df_test["Poem"]
X_test

0      A woman walks by the bench I’m sitting onwith ...
1      Because I am a boy, the untouchability of beau...
2      Because today we did not leave this world,We n...
3      Big Bend has been here, been here. Shouldn’t i...
4      I put shells there, along the lip of the road....
                             ...                        
145    To pick a tulip from the garden, the red one. ...
146    We are as clouds that veil the midnight moon; ...
147    When pulled, the spider web took another form....
148    Whose woods these are I think I know. His hous...
149    you can make the maples blazejust by stopping ...
Name: Poem, Length: 150, dtype: object

In [14]:
y_test = df_test['Genre']
y_test

0            Music
1            Music
2            Music
3            Music
4            Music
          ...     
145    Environment
146    Environment
147    Environment
148    Environment
149    Environment
Name: Genre, Length: 150, dtype: object

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer(stop_words='english', max_df=100) 

count_vector.fit(X_train)
X_train_count_vector = count_vector.transform(X_train)
X_test_count_vector = count_vector.transform(X_test)

X_train_count_vector.shape, X_test_count_vector.shape

((837, 8169), (150, 8169))

In [17]:
X_train_count_vector, X_test_count_vector

(<837x8169 sparse matrix of type '<class 'numpy.int64'>'
 	with 18814 stored elements in Compressed Sparse Row format>,
 <150x8169 sparse matrix of type '<class 'numpy.int64'>'
 	with 2522 stored elements in Compressed Sparse Row format>)

###Baseline Models

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression(max_iter=5000)
lr_clf.fit(X_train_count_vector, y_train)
pred = lr_clf.predict(X_test_count_vector)

accuracy_score(y_test, pred)

0.34

In [20]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

print(confusion_matrix(y_test, pred))


[[25 23 21 31]
 [ 2  3  5  3]
 [ 1  6 14  4]
 [ 0  0  3  9]]
              precision    recall  f1-score   support

   Affection       0.89      0.25      0.39       100
       Death       0.09      0.23      0.13        13
 Environment       0.33      0.56      0.41        25
       Music       0.19      0.75      0.31        12

    accuracy                           0.34       150
   macro avg       0.38      0.45      0.31       150
weighted avg       0.67      0.34      0.37       150



(None, None)

In [21]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

   Affection       0.89      0.25      0.39       100
       Death       0.09      0.23      0.13        13
 Environment       0.33      0.56      0.41        25
       Music       0.19      0.75      0.31        12

    accuracy                           0.34       150
   macro avg       0.38      0.45      0.31       150
weighted avg       0.67      0.34      0.37       150



In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer(stop_words='english', max_df=100)

tfidf_vector.fit(X_train)

X_train_tfidf_vector = tfidf_vector.transform(X_train)
X_test_tfidf_vector = tfidf_vector.transform(X_test)

X_train_tfidf_vector.shape, X_test_tfidf_vector.shape

((837, 8169), (150, 8169))

In [23]:
lr_clf_2 = LogisticRegression(max_iter=5000)
lr_clf_2.fit(X_train_tfidf_vector, y_train)
pred = lr_clf_2.predict(X_test_tfidf_vector)

accuracy_score(y_test, pred)

0.28

In [24]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('lr_clf', LogisticRegression())])

In [25]:
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy_score(y_test, pred)

0.34

In [26]:
from sklearn.svm import SVC

linearsvc = SVC()

pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('linearsvc', SVC(kernel='linear'))])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy_score(y_test, pred)

0.3333333333333333

In [27]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('mnb', MultinomialNB())])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy_score(y_test, pred)

0.2866666666666667

In [28]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(10,10,10), max_iter=5000, random_state=2211)

pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('mlp', MLPClassifier(hidden_layer_sizes=(10,10,10), max_iter=5000, random_state=2211))])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy_score(y_test, pred)

0.31333333333333335

###Embedding Models


RoBERTa

In [29]:
Name0=df_train['Genre'].unique().tolist()
Name=sorted(Name0)
N=list(range(len(Name)))
normal_mapping=dict(zip(Name,N)) 
reverse_mapping=dict(zip(N,Name)) 
print(normal_mapping)
df_train['Genre']=df_train['Genre'].map(normal_mapping)
df_test['Genre']=df_test['Genre'].map(normal_mapping)

{'Affection': 0, 'Death': 1, 'Environment': 2, 'Music': 3}


In [31]:
import re
df_train['Poem'] = df_train['Poem'].replace(re.compile(r'[\n\r\t]'), ' ', regex=True)
df_test['Poem'] = df_test['Poem'].replace(re.compile(r'[\n\r\t]'), ' ', regex=True)

In [35]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_len = 128

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [36]:
def create_data(text):
    
    encoded = tokenizer.batch_encode_plus(
        text,
        add_special_tokens = True,
        max_length= max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True)

    input_ids       = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="int32")

    return {"input_ids": input_ids, "attention_masks": attention_masks}

In [38]:
train_data   = create_data(df_train['Poem'])
train = []
train.append(df_train['Genre'].to_list())

In [39]:
def build_model():
    
    model_ids  = Input(shape=(max_len, ), dtype = tf.int32)
    model_mask = Input(shape=(max_len, ), dtype = tf.int32)
    
    roberta_model = TFRobertaModel.from_pretrained("../input/roberta-base/")
    
    x = roberta_model(input_ids = model_ids, 
                      attention_mask = model_mask)       
    x = tf.keras.layers.GlobalAveragePooling1D()(x.last_hidden_state)    
    outputs = Dense(len(Name))(x)
    
    model = tf.keras.Model(inputs = [model_ids, model_mask], outputs = outputs)
    
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss = "mse",
        metrics=["mse"])
    return model

In [40]:
def scheduler(epoch):
    learning_rate = 2e-5
    if epoch == 0:
        return learning_rate * 0.05
    else:
        return learning_rate * (0.9**epoch)
    
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [41]:
def get_model(train_col) :
    
    model = build_model()
    model.fit((np.array(train_data['input_ids']),
               np.array(train_data['attention_masks'])),
        np.array(train_col).ravel(), 
        epochs = 5,
        shuffle=True,
        callbacks = [EarlyStopping(monitor='val_mse', patience=3, restore_best_weights=True), 
                     ModelCheckpoint('roberta_uspppm.h5', monitor='val_mse', 
                                     save_best_only=True, save_weights_only=True), 
                     callback_lr],                     
        batch_size = 16,
        validation_split=0.2 )
    
    return model

In [42]:
%%time

target_cols = ['Genre']
models = {}

for i, col in enumerate(target_cols) :
            
    print (f"-------------- Model for {col} ---------------")
    model = get_model(train[i])
    models[i] = model

-------------- Model for Genre ---------------


OSError: ignored