# Preprocess

In [None]:
import json
import glob
import os

from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True is passed so the
# call asks for a fresh mount even when one already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Root of the shared project folder. This is a relative path, so it resolves
# against the current working directory of the kernel.
project_root = 'drive/My Drive/11785 Team Project/'
# Directory that parse_json() globs for the poem JSON files.
poem_root = os.path.join(project_root, 'chinese-poetry/json')
# Directory that parse_json() globs for the matching strain JSON files.
strain_root = os.path.join(project_root, 'chinese-poetry/strains/json')
# Output file written by parse_json().
poems_tsv = os.path.join(project_root, 'poems.tsv')

def parse_json():
    """Write every accepted Tang and Song poem, with its strain, to ``poems_tsv``.

    Poem files are globbed from ``poem_root`` and strain files from
    ``strain_root`` using the same ``poet.<dynasty>.[0-9]*.json`` pattern.
    Both lists are sorted so that the i-th poem file is paired with the i-th
    strain file by ``parse_poems_strains``.

    The output starts with the header row
    ``Title / Dynasty / Type / Content / Strain`` (tab separated), followed by
    the Tang rows and then the Song rows.
    """
    def numbered_files(root, pattern):
        # Sorted so poem and strain lists line up index by index.
        return sorted(glob.glob(os.path.join(root, pattern)))

    tang_poem_json = numbered_files(poem_root, 'poet.tang.[0-9]*.json')
    tang_strain_json = numbered_files(strain_root, 'poet.tang.[0-9]*.json')
    song_poem_json = numbered_files(poem_root, 'poet.song.[0-9]*.json')
    song_strain_json = numbered_files(strain_root, 'poet.song.[0-9]*.json')

    # The rows contain Chinese text, so fix the encoding instead of relying on
    # the platform's locale default.
    with open(poems_tsv, 'w', encoding='utf-8') as w:
        w.write('Title\tDynasty\tType\tContent\tStrain\n')
        parse_poems_strains(w, tang_poem_json, tang_strain_json, 'Tang')
        parse_poems_strains(w, song_poem_json, song_strain_json, 'Song')

def parse_poems_strains(w, poem_json, strain_json, dynasty):
    """Append one TSV row per accepted poem to the open file ``w``.

    Args:
        w: Writable text file object; rows are written with ``w.write``.
        poem_json: List of poem JSON file paths.
        strain_json: List of strain JSON file paths. ``strain_json[i]`` is
            paired with ``poem_json[i]``, and entry ``j`` of a strain file is
            paired with entry ``j`` of its poem file.
        dynasty: Dynasty tag forwarded to ``parse_poem``.

    Raises:
        IndexError: If ``strain_json`` is shorter than ``poem_json``, or a
            strain file has fewer entries than an accepted poem's index.
    """
    for i, poem_path in enumerate(poem_json):
        strain_path = strain_json[i]
        # Decode explicitly as UTF-8 rather than using the locale default,
        # and release both handles before the per-poem work starts.
        with open(strain_path, 'r', encoding='utf-8') as f_s, \
                open(poem_path, 'r', encoding='utf-8') as f_p:
            poems = json.load(f_p)
            strains = json.load(f_s)

        for j in range(len(poems)):
            parsed_poem = parse_poem(poems[j], dynasty)
            # parse_poem returns '' for poems that are filtered out.
            if parsed_poem != '':
                w.write(parsed_poem + parse_strain(strains[j]) + '\n')

def parse_strain(strain):
    """Return the entries of ``strain['strains']`` concatenated into one string."""
    return ''.join(strain['strains'])

def parse_poem(poem, dynasty):
    """Format one poem as the leading TSV fields, or return '' to skip it.

    A poem is kept only when ``poem['paragraphs']`` has exactly two entries of
    equal length, and that length is 12 or 16. Per the original note, 12 is a
    five-character couplet plus its comma and period, and 16 is the
    seven-character equivalent. Only the lengths are checked.

    Args:
        poem: Mapping with ``'paragraphs'`` and ``'title'`` keys.
        dynasty: ``'Tang'`` or ``'Song'``; any other value adds no dynasty field.

    Returns:
        ``title \\t [dynasty \\t] type \\t content \\t`` for a kept poem,
        otherwise the empty string.
    """
    paragraphs = poem['paragraphs']

    if len(paragraphs) != 2:
        return ''
    couplet_len = len(paragraphs[0])
    # The second paragraph is only measured once the first has an accepted length.
    if couplet_len not in (12, 16) or len(paragraphs[1]) != couplet_len:
        return ''

    pieces = [poem['title'], '\t']

    if dynasty == 'Tang':
        pieces.append('唐\t')
    elif dynasty == 'Song':
        pieces.append('宋\t')

    pieces.append('五言绝句\t' if couplet_len == 12 else '七言绝句\t')

    pieces.extend(paragraphs)
    pieces.append('\t')
    return ''.join(pieces)

In [None]:
# Run the preprocessing step: (re)writes the TSV at `poems_tsv`.
parse_json()

# Word2Vec

In [None]:
!pip install panda
!pip install opencc
import torch.nn as nn
import codecs
import os
import numpy as np
import bz2

Collecting opencc
[?25l  Downloading https://files.pythonhosted.org/packages/d5/b4/24e677e135df130fc6989929dc3990a1ae19948daf28beb8f910b4f7b671/OpenCC-1.1.1.post1-py2.py3-none-manylinux1_x86_64.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 3.5MB/s 
[?25hInstalling collected packages: opencc
Successfully installed opencc-1.1.1.post1


In [None]:
# Root of the shared project folder (relative to the current working directory).
project_root = 'drive/My Drive/11785 Team Project/'
# bz2-compressed embedding file read by load_dense_drop_repeat().
path_to_embedding = os.path.join(project_root, 'sgns.sikuquanshu.bigram.bz2')

def load_dense_drop_repeat(path):
    """Load a bz2-compressed text embedding file, keeping the first vector per token.

    The first line is a header whose first two whitespace-separated fields
    are the row count and the vector dimension. Every later line is a token
    followed by its vector components. A token that appears more than once
    keeps only its first vector. Blank lines are skipped.

    Args:
        path: Path of the ``.bz2`` embedding file.

    Returns:
        A tuple ``(matrix, vocab, size, unique_size)``:

        * ``matrix``: float32 array of shape ``(row_count, size)``, sized from
          the header. Row ``i`` holds the vector of ``vocab["i2w"][i]``; rows
          from ``unique_size`` onward are left as zeros.
        * ``vocab``: ``{"i2w": [...], "w2i": {...}}``. Tokens are ``bytes``
          because the file is read in binary mode, so look-ups must use
          encoded keys.
        * ``size``: vector dimension from the header.
        * ``unique_size``: number of distinct tokens stored.

    Raises:
        ValueError: If the file contains no header line.
    """
    word_to_index = {}
    index_to_word = []
    matrix = None
    size = 0

    # Mode "r" is binary for bz2.open, so every field below is a bytes object.
    with bz2.open(path, "r") as content:
        for line in content:
            fields = line.split()

            if matrix is None:
                # Header line: "<row_count> <dim> ...".
                vocab_size = int(fields[0])
                size = int(fields[1])
                matrix = np.zeros(shape=(vocab_size, size), dtype=np.float32)
                continue

            if not fields:
                # Blank line: nothing to index.
                continue

            token = fields[0]
            if token in word_to_index:
                # Duplicate token: keep the first vector only.
                continue

            row = len(index_to_word)
            matrix[row, :] = np.array([float(x) for x in fields[1:]])
            word_to_index[token] = row
            index_to_word.append(token)

    if matrix is None:
        raise ValueError("embedding file has no header line: {!r}".format(path))

    vocab = {"i2w": index_to_word, "w2i": word_to_index}
    return matrix, vocab, size, len(index_to_word)

# Load the embedding: `matrix` holds the vectors, `vocab` the token<->row maps,
# `size` the vector dimension and `unique_size` the number of distinct tokens.
matrix, vocab, size, unique_size = load_dense_drop_repeat(path_to_embedding)

# Load saved embedding.
# embedding_matrix = np.load(path_to_embedding)
# embedding = nn.Embedding(vocab_size, embedding_size)
# embedding.weight.data.copy_(torch.from_numpy(embedding))
# embedding.weight.requires_grad = True

300 19527


In [None]:
import pandas as pd
import opencc
import re
import numpy as np

# Load the poems, convert traditional Chinese to simplified Chinese, split each
# poem into lines, and look up one embedding row per character.
project_root = 'drive/My Drive/11785 Team Project/'
file_name = 'poems.tsv'
converter = opencc.OpenCC('t2s.json')
file = pd.read_csv(''.join((project_root, file_name)), sep='\t', header=0)

# Named `poem_types` rather than `type` so the builtin `type()` stays usable
# in later cells.
poem_types = file['Type']
cnt = 0      # Characters that have no entry in vocab["w2i"].
total = 0    # Characters examined (after conversion to simplified Chinese).
vector = []  # One entry per fully covered poem: a list of embedding rows, one per character.
for idx, poem in enumerate(file['Content']):
    if poem_types[idx] != "五言绝句":
        continue
    lines = poem.strip()
    lines = lines.replace('？', '')
    # Raw string: the old '\。' was an invalid escape in a normal string literal.
    lines = re.split(r'[，。]', lines)
    tmp = []
    err_cnt = 0
    for line in lines:
        if not line:
            # Skip empty lines.
            continue
        simplify_line = converter.convert(line)
        total += len(simplify_line)
        for ch in simplify_line:
            # The vocabulary keys are bytes, so look up the UTF-8 encoding.
            ch_encode = ch.encode('utf-8')
            if ch_encode not in vocab["w2i"]:
                # Do not have this character in dictionary.
                err_cnt += 1
            else:
                tmp.append(matrix[vocab["w2i"][ch_encode]])
    # Decide once per poem, after every line has been scanned. Doing this
    # inside the line loop appended the same list once per line and re-added
    # the running error count on every line.
    cnt += err_cnt
    if err_cnt == 0:
        # Only add valid poems.
        vector.append(tmp)
        
# print(cnt, total)  # Invalid character cnt and total cnt.
# print(len(vector), len(file['Content']))
# print(vector.shape, vector)

1509 342859
67118 101164


# Generate train.txt and valid.txt for the Transformer-XL (TXL) model

In [None]:
def _write_lines(path, lines):
  """Write each string in `lines` to `path`, one per line, encoded as UTF-8."""
  with open(path, 'w', encoding='utf-8') as w:
    for text in lines:
      w.write(text + '\n')


def generateFile(validateNum):
  """Export the `Content` column of the `poems` worksheet to train.txt and valid.txt.

  The worksheet's first row is used as the header. The last `validateNum`
  data rows go to valid.txt and all earlier rows go to train.txt, one poem
  per line.

  Args:
    validateNum: Number of trailing rows to hold out for validation. Values
      outside `0..len(rows)` are clamped, so 0 yields an empty valid.txt
      instead of a copy of the whole dataset.
  """
  from google.colab import auth
  import gspread
  # NOTE(review): oauth2client is deprecated upstream and newer gspread releases
  # expect google-auth credentials; confirm this still works in the target runtime.
  from oauth2client.client  import GoogleCredentials
  import pandas as pd

  auth.authenticate_user()
  gc = gspread.authorize(GoogleCredentials.get_application_default())
  ## please modify the url link when you open the google sheet of in for "poems" in google drive
  wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/11LGwfUQ3x9KqASimVvgYAZwdZUz8qMYeJWfvvNqP2ec/edit#gid=253898745')
  sheet = wb.worksheet('poems')
  data = sheet.get_all_values()

  # Promote the first sheet row to column names and drop it from the data.
  df = pd.DataFrame(data)
  df.columns = df.iloc[0]
  df = df.iloc[1:]
  contents = df['Content']

  # Use one explicit split index for both files: `iloc[-0:]` would select
  # every row, not none.
  split = min(max(len(contents) - validateNum, 0), len(contents))

  _write_lines('/content/transformer-xl-chinese/data/11785Shi/train.txt', contents.iloc[:split])
  _write_lines('/content/transformer-xl-chinese/data/11785Shi/valid.txt', contents.iloc[split:])