In [None]:
import torch
# Show whether PyTorch can see a CUDA device; the models created later in this
# notebook are all constructed with use_cuda=True.
torch.cuda.is_available()

In [None]:
import pandas as pd
import random
import numpy as np
import sklearn
import json
import gzip

# Seed Python's and NumPy's global random number generators so that the random
# draws made later in the notebook (e.g. np.random.choice) are repeatable.
random.seed(2021)
np.random.seed(2021)

In [None]:
# Only needed the first time, to generate the plain-text data files.
# Defining the function costs nothing; no file is touched until it is called.
def read_input(jsonname, txtname, line_limit=None):
    """Convert a gzipped JSON-lines lyrics file into a one-song-per-line text file.

    Every record must carry a 'lyrics' string. Newlines inside the lyrics are
    replaced with ' [LINE] ' and the result is cut to at most 1024 characters,
    so that each song occupies exactly one line of the output file.

    Args:
        jsonname: Path of the gzip-compressed JSON-lines input file.
        txtname: Path of the text file to create or overwrite.
        line_limit: If truthy, keep only the first `line_limit` records.

    Returns:
        The list of parsed records, with their 'lyrics' value rewritten in place.
    """
    result = []
    with gzip.open(jsonname, 'rb') as fin:
        # Parse line by line instead of loading the whole decompressed file
        # as a single blob; blank lines are ignored.
        for jline in fin:
            if not jline.strip():
                continue
            result.append(json.loads(jline))
            # Stop reading as soon as a positive limit has been reached.
            if line_limit and len(result) == line_limit:
                break
    if line_limit:
        result = result[:line_limit]

    for item in result:
        item['lyrics'] = item['lyrics'].replace('\n', ' [LINE] ')[:1024]

    with open(txtname, 'w') as f:
        for item in result:
            # write(), not writelines(): the argument is one string, not a list of lines.
            f.write(item['lyrics'] + '\n')
    return result

In [None]:
# Note: the outputs of this notebook have been cleared, because the tasks were run separately and the combined output was messy.
import logging

from simpletransformers.language_modeling import LanguageModelingModel, LanguageModelingArgs

# Text files used for training, validation and testing of the language model.
train_file, dev_file, test_file = "train.txt", "dev.txt", "test.txt"

# Log at INFO level in general, but only show warnings (and above) from the
# "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Run 1: configuration adapted from the Simple Transformers example code.
model_args = LanguageModelingArgs(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    output_dir="./output1",
    num_train_epochs=1,
    train_batch_size=5,
    learning_rate=4e-7,
    adam_epsilon=1e-9,
    dataset_type="simple",
    mlm=False,  # mlm must be False for CLM
    max_seq_length=1024,
    manual_seed=2021,  # fixed seed for reproducibility
)
# Optional settings, left disabled:
# model_args.evaluate_during_training = True
# model_args.evaluate_during_training_verbose = True
# model_args.evaluate_during_training_steps = 50000

model1 = LanguageModelingModel("gpt2", "gpt2", use_cuda=True, args=model_args)

# Fine-tune on the training file.
model1.train_model(train_file)

In [None]:
# Evaluate model1 on the training file; the bare `result` on the last line
# makes the notebook display the returned value.
result = model1.eval_model(train_file)
result

In [None]:
# Evaluate model1 on the development file and display the returned value.
result = model1.eval_model(dev_file)
result

In [None]:
# Run 2: same setup as the first run, but with batch size 4, learning rate 4e-6
# and Adam epsilon 4e-9; results go to ./output2.
model_args = LanguageModelingArgs(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    output_dir="./output2",
    num_train_epochs=1,
    train_batch_size=4,
    learning_rate=4e-6,
    adam_epsilon=4e-9,
    dataset_type="simple",
    mlm=False,  # mlm must be False for CLM
    max_seq_length=1024,
    manual_seed=2021,
)

model2 = LanguageModelingModel("gpt2", "gpt2", use_cuda=True, args=model_args)

# Fine-tune on the training file.
model2.train_model(train_file)

In [None]:
# Evaluate model2 on the training file and display the returned value.
result = model2.eval_model(train_file)
result

In [None]:
# Evaluate model2 on the development file and display the returned value.
result = model2.eval_model(dev_file)
result

In [None]:
# Run 3: the hyperparameters of the first run, trained for 3 epochs instead of 1;
# results go to ./output3.
model_args = LanguageModelingArgs(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    output_dir="./output3",
    num_train_epochs=3,
    train_batch_size=5,
    learning_rate=4e-7,
    adam_epsilon=1e-9,
    dataset_type="simple",
    mlm=False,  # mlm must be False for CLM
    max_seq_length=1024,
    manual_seed=2021,
)

model3 = LanguageModelingModel("gpt2", "gpt2", use_cuda=True, args=model_args)

# Fine-tune on the training file.
model3.train_model(train_file)

In [None]:
# Evaluate model3 on the training file and display the returned value.
result = model3.eval_model(train_file)
result

In [None]:
# Evaluate model3 on the development file and display the returned value.
result = model3.eval_model(dev_file)
result

In [None]:
# Only model3 is evaluated on the held-out test file, because it was judged
# to be the best of the three runs.
result = model3.eval_model(test_file)  # model3 is the best one
result

In [None]:
# First run only: uncomment the read_input call below (and comment out the file
# read that follows it) to generate the corresponding input file.
# train = read_input('scratch/song-lyrics.train.jsonl.gz', 'train.txt', 100000)
with open('train.txt', 'r') as train_fh:
    # One list entry per line of the file, line endings included.
    train = list(train_fh)

In [None]:
# First run only: generate dev.txt with the commented-out call instead of reading it.
# dev = read_input('scratch/song-lyrics.dev.jsonl.gz', 'dev.txt')
with open('dev.txt', 'r') as dev_fh:
    # One list entry per line of the file, line endings included.
    dev = list(dev_fh)

In [None]:
# First run only: generate test.txt with the commented-out call instead of reading it.
# test = read_input('scratch/song-lyrics.test.jsonl.gz', 'test.txt')
with open('test.txt', 'r') as test_fh:
    # One list entry per line of the file, line endings included.
    test = list(test_fh)

In [None]:
# First run only: generate machine-train.txt with the commented-out call instead of reading it.
# machine_train = read_input('scratch/lyrics.machine-gen.train.jsonl.gz', 'machine-train.txt')
with open('machine-train.txt', 'r') as machine_train_fh:
    # One list entry per line of the file, line endings included.
    machine_train = list(machine_train_fh)

In [None]:
# First run only: generate machine-dev.txt with the commented-out call instead of reading it.
# machine_dev = read_input('scratch/lyrics.machine-gen.dev.jsonl.gz', 'machine-dev.txt')
with open('machine-dev.txt', 'r') as machine_dev_fh:
    # One list entry per line of the file, line endings included.
    machine_dev = list(machine_dev_fh)

In [None]:
# First run only: generate machine-test.txt with the commented-out call instead of reading it.
# machine_test = read_input('scratch/lyrics.machine-gen.test.jsonl.gz', 'machine-test.txt')
with open('machine-test.txt', 'r') as machine_test_fh:
    # One list entry per line of the file, line endings included.
    machine_test = list(machine_test_fh)

In [None]:
def full_data(machine_data, song_data):
    """Combine song texts and machine-generated texts into one labelled DataFrame.

    Args:
        machine_data: Iterable of machine-generated texts; each gets label 0.
        song_data: Iterable of song texts; each gets label 1.

    Returns:
        A DataFrame with the columns "text" and "labels". The song rows come
        first, followed by the machine rows, each group in input order. When
        both inputs are empty the result is an empty frame that still has
        both columns.
    """
    rows = [[text, 1] for text in song_data]
    rows.extend([text, 0] for text in machine_data)
    # Naming the columns in the constructor (instead of assigning df.columns
    # afterwards) also works when there are no rows at all.
    return pd.DataFrame(rows, columns=["text", "labels"])

In [None]:
from simpletransformers.language_generation import LanguageGenerationModel, LanguageGenerationArgs

# Load the fine-tuned model saved in ./output3 and generate text from a few
# one-word prompts.
model_args = LanguageGenerationArgs(manual_seed=2021, max_length=1024)
text_generator = LanguageGenerationModel('gpt2', './output3', use_cuda=True, args=model_args)

prompts = ["My", "The", "One", "When", "If"]

for prompt in prompts:
    # Only the first element of the generate() result is used; the '[LINE]'
    # markers are turned back into newlines for display.
    generated = text_generator.generate(prompt)[0].replace('[LINE]', '\n')
    print(f'Prompt: {prompt} \n')
    print(f'Generated text: {generated} \n')

In [None]:
# Draw 500 training lines (np.random.choice samples with replacement by default)
# and use the first space-delimited token of each line as a generation prompt.
sample = np.random.choice(train, 500)
generated = {}

for index, line in enumerate(sample):
    first_word = line.split(' ', 1)[0]
    outputs = text_generator.generate(first_word, args={'max_length': 1000})
    # Keep only the first returned text, with '[LINE]' markers turned into newlines.
    generated[index] = outputs[0].replace('[LINE]', '\n')

# Save the generations so that they can be reloaded without generating again.
with open('generated.json', 'w') as out_file:
    json.dump(generated, out_file)

In [None]:
# Reload the saved generations and turn them into a labelled frame
# (label 0, the same label full_data gives to machine-generated text).
with open('generated.json', 'r') as file:
    generated = json.load(file)

# Walk over whatever was saved, in numeric key order, instead of assuming that
# exactly 500 entries exist. After the JSON round trip the keys are the
# stringified integer indices.
part2_data = [
    [generated[key].replace('\n', '[LINE]'), 0]
    for key in sorted(generated, key=int)
]
part2_df = pd.DataFrame(part2_data, columns=["text", "labels"])
part2_df.sample(5)

In [None]:
# Labelled training frame: song lines get label 1, machine-generated lines label 0.
train_df = full_data(machine_data=machine_train, song_data=train)
train_df.sample(n=5)

In [None]:
# Labelled development frame: song lines get label 1, machine-generated lines label 0.
dev_df = full_data(machine_data=machine_dev, song_data=dev)
dev_df.sample(n=5)

In [None]:
# Labelled test frame: song lines get label 1, machine-generated lines label 0.
test_df = full_data(machine_data=machine_test, song_data=test)
test_df.sample(n=5)

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
# A bare `import sklearn` is not guaranteed to load the `metrics` submodule, so
# import it explicitly before sklearn.metrics.accuracy_score is used.
import sklearn.metrics

# Configuration of the DistilBERT classifier that separates song lines (label 1)
# from machine-generated lines (label 0).
model_args = ClassificationArgs()
model_args.num_train_epochs = 3
model_args.train_batch_size = 5
model_args.learning_rate = 4e-7
model_args.adam_epsilon = 1e-8
model_args.output_dir = "./output-class"
model_args.manual_seed = 2021

model = ClassificationModel("distilbert", "distilbert-base-cased", use_cuda=True, args=model_args)

# Train the model; accuracy is passed as an additional metric named "acc".
model.train_model(train_df, acc=sklearn.metrics.accuracy_score)

In [None]:
# Evaluate the classifier on the training frame. The call's return value is
# unpacked into three names; only `result` is displayed.
result, model_outputs, wrong_predictions = model.eval_model(train_df, acc=sklearn.metrics.accuracy_score)
# Probably due to the large size of the data, execution sometimes gets stuck after evaluation. Restarting the kernel and
# running the evaluations separately can solve the problem.
result

In [None]:
# Evaluate the classifier on the development frame and display `result`.
result, model_outputs, wrong_predictions = model.eval_model(dev_df, acc=sklearn.metrics.accuracy_score)
result

In [None]:
# Evaluate the classifier on the test frame and display `result`.
result, model_outputs, wrong_predictions = model.eval_model(test_df, acc=sklearn.metrics.accuracy_score)
result

In [None]:
# Evaluate the classifier on part2_df, the frame built from the texts generated
# by the fine-tuned language model (every row is labelled 0), and display `result`.
result, model_outputs, wrong_predictions = model.eval_model(part2_df, acc=sklearn.metrics.accuracy_score)
result