In [114]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import numpy as np

In [115]:
# Get the text prior to long format
original_text = pd.read_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\unigram_data.h5", key="original")

In [116]:
# This is the text in long format
ordered_text = pd.read_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\new_unigram_data.h5", key="text")

In [117]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize.nist import NISTTokenizer

def replace_quotes(x):
    for match in re.findall(" “.+?[”'\1{2}]", x):
        x = x.replace(match, "")

    return x

def preprocess(x):
    tk = NISTTokenizer()
    x = replace_quotes(x)
    x = sent_tokenize(x)
    x = [[word for word in tk.tokenize(sentence) if word not in string.punctuation] for sentence in x]
    
    return x

platforms = ["Pitchfork", "Guardian", "Spectrum", "NME"]

# Tokenize by word and sentence, replace quotes and remove punctuation
for platform in platforms:
    
    original_text.loc[:, platform] = original_text.loc[:, platform].apply(preprocess)

In [118]:
# Put into long format
original_text = original_text.melt(id_vars=["Artist", "Album"], value_vars=["Pitchfork", "Guardian", "Spectrum", "NME"], var_name="Platform", value_name="Text")

In [119]:
# Get rid of nan occurrences from empty splits on tokenization
original_text.loc[:, "Text"] = original_text.loc[:, "Text"].apply(lambda x: [[word for word in sentence if word != np.nan] for sentence in x])

In [120]:
# Join with a weird character so that it can be easily split again without interfering with the data
# In hindsight could've just used a whitespace
original_text["Full Text"] = original_text.loc[:, "Text"].apply(lambda x: "|".join(["|".join(sent) for sent in x]))

In [121]:
# Get rid of duplicate rows where multiple guardian reviews were present
original_text = original_text.drop_duplicates(subset=["Full Text"]).drop(columns=["Full Text"])

In [122]:
# Sort by the same ordering as the text
artist_album_sort = ordered_text.loc[:, ["Artist", "Album"]]
# pd.categorical allows you to sort by the index of ordered_text
original_text.loc[:, "Artist"] = pd.Categorical(original_text.loc[:, "Artist"], artist_album_sort.loc[:, "Artist"].unique())
original_text.loc[:, "Album"] = pd.Categorical(original_text.loc[:, "Album"], artist_album_sort.loc[:, "Album"].unique())
original_text.loc[:, "Platform"] = pd.Categorical(original_text.loc[:, "Platform"], ["Pitchfork", "Guardian", "Spectrum", "NME"])
# Use the sorting methods outlined above to sort the entire dataset
original_text = original_text.sort_values(["Platform", "Album", "Artist"]).reset_index(drop=True)

  original_text.loc[:, "Artist"] = pd.Categorical(original_text.loc[:, "Artist"], artist_album_sort.loc[:, "Artist"].unique())
  original_text.loc[:, "Album"] = pd.Categorical(original_text.loc[:, "Album"], artist_album_sort.loc[:, "Album"].unique())
  original_text.loc[:, "Platform"] = pd.Categorical(original_text.loc[:, "Platform"], ["Pitchfork", "Guardian", "Spectrum", "NME"])


In [123]:
original_text["Review id"] = [x for x in range(original_text.shape[0])]

In [124]:
ordering = ordered_text.copy()
ordering.loc[:, "Artist"] = pd.Categorical(ordering.loc[:, "Artist"], ordered_text.loc[:, "Artist"].unique(), True)
ordering.loc[:, "Album"] = pd.Categorical(ordering.loc[:, "Album"], ordered_text.loc[:, "Album"].unique(), True)
ordering.loc[:, "Platform"] = pd.Categorical(ordering.loc[:, "Platform"], ["Pitchfork", "Guardian", "Spectrum", "NME"], True)

  ordering.loc[:, "Artist"] = pd.Categorical(ordering.loc[:, "Artist"], ordered_text.loc[:, "Artist"].unique(), True)
  ordering.loc[:, "Album"] = pd.Categorical(ordering.loc[:, "Album"], ordered_text.loc[:, "Album"].unique(), True)
  ordering.loc[:, "Platform"] = pd.Categorical(ordering.loc[:, "Platform"], ["Pitchfork", "Guardian", "Spectrum", "NME"], True)


In [126]:
ordering = ordering.sort_values(by=["Platform", "Album", "Artist"])
original_text.loc[:, "Review id"] = ordering.loc[:, "Review id"].tolist()

In [133]:
original_text.loc[:, "Text"] = original_text.apply(lambda x: list(x))

In [134]:
original_text = original_text.astype({"Artist": str,
                    "Album": str,
                    "Platform": str,
                    "Text": str})

In [135]:
import ast
original_text.loc[:, "Text"] = original_text.loc[:, "Text"].apply(
    lambda x: str([sentence for sentence in ast.literal_eval(x) if len(sentence) > 0]))

In [136]:
original_text.to_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\new_unigram_data.h5", key="sentence", format="table")