In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.nist import NISTTokenizer
from nltk.corpus import opinion_lexicon

In [3]:
text = pd.read_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\new_unigram_data.h5", key="text")
X_original = pd.read_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\new_unigram_data.h5", key="form")

In [4]:
# Put the words into lower case
X_no_stop = X_original.copy()
X_no_stop.loc[:, "Word"] = X_no_stop.loc[:, "Word"].astype(
    str).apply(
    lambda x: x.lower())

In [5]:
# Remove the stop words so that we have the reduced data frame
X_no_stop = X_no_stop[X_no_stop.loc[:, "Word"].apply(lambda x: x not in stopwords.words("english"))]

In [6]:
X_no_stop.to_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\new_unigram_data.h5", key="no_stop", mode="a")

In [7]:
labelled = X_no_stop.loc[X_no_stop["Desired"] == 1, ["Review id"]].drop_duplicates()

In [8]:
labelled

Unnamed: 0,Review id
301,0
779,175
949,363
1648,538
2012,1
2881,176
3320,364
4058,539
4577,2
6467,177


In [9]:
# Get the text for each platform for the albums that have been labelled
long_text = pd.merge(labelled, text, on=["Review id"], how="inner")
# Get the labels for each word in the labelled albums
X_labelled = pd.merge(labelled, X_no_stop, on=["Review id"], how="inner")

In [10]:
# Check we have all the desired words in correctly
X_no_stop.loc[X_no_stop["Desired"] == 1]

Unnamed: 0,Artist,Album,Platform,Review id,Word id,Word,Desired
301,sleaford mods,key markets,Pitchfork,0,301,working,1.0
302,sleaford mods,key markets,Pitchfork,0,302,class,1.0
304,sleaford mods,key markets,Pitchfork,0,304,politics,1.0
779,sleaford mods,key markets,Guardian,175,152,anger,1.0
793,sleaford mods,key markets,Guardian,175,166,humour,1.0
...,...,...,...,...,...,...,...
438476,bjork,fossora,NME,713,418,processing,1.0
438478,bjork,fossora,NME,713,420,progressing,1.0
438620,bjork,fossora,NME,713,562,love,1.0
438705,bjork,fossora,NME,713,647,reinfatuation,1.0


In [11]:
X_labelled.loc[(X_labelled["Artist"] == "blood orange") & (X_labelled["Platform"] == "Guardian")]

Unnamed: 0,Review id,Artist,Album,Platform,Word id,Word,Desired
7337,179,blood orange,negro swan,Guardian,0,british-born,-1.0
7338,179,blood orange,negro swan,Guardian,1,musician,-1.0
7339,179,blood orange,negro swan,Guardian,2,devonté,-1.0
7340,179,blood orange,negro swan,Guardian,3,hynes,-1.0
7341,179,blood orange,negro swan,Guardian,5,quietly,-1.0
...,...,...,...,...,...,...,...
7506,179,blood orange,negro swan,Guardian,270,self,-1.0
7507,179,blood orange,negro swan,Guardian,271,negro,-1.0
7508,179,blood orange,negro swan,Guardian,272,swan,-1.0
7509,179,blood orange,negro swan,Guardian,275,dizzying,-1.0


In [12]:
# POS tag all words in the relevant reviews
long_text.loc[:, "Text"] = long_text.loc[:, "Text"].apply(lambda x: nltk.pos_tag(x))

In [13]:
# Explode so that each row contains one word
long_text = long_text.explode("Text")

In [14]:
# Convert all words to lower case

def convert_lower(x):
    
    if type(x) == str:
        return x.lower()
    else:
        return (x[0].lower(), x[1])
    
long_text.loc[:, "Text"] = long_text.loc[:, "Text"].apply(convert_lower)

In [15]:
# Remove all stop words from the tagged text

def check_stop(x):
    
    if type(x) != tuple:
        return True
    else:
        if x[0] in stopwords.words("english"):
            return False
        else:
            return True
        
long_text["not_stop"] = long_text.loc[:, "Text"].apply(check_stop)

In [16]:
# Remove stop words
long_text = long_text.loc[long_text.loc[:, "not_stop"]].drop(columns=["not_stop"])

In [17]:
def return_tag(x):
        
    if type(x) != tuple:
        return "NA"
    
    else:
        return x[1]

# Get the previous and next tag for each tag
long_text.loc[:, "ptag"] = long_text.loc[:, "Text"].shift(1).apply(return_tag)
long_text.loc[:, "ntag"] = long_text.loc[:, "Text"].shift(-1).apply(return_tag)
long_text

Unnamed: 0,Review id,Artist,Album,Platform,Text,ptag,ntag
0,0,sleaford mods,key markets,Pitchfork,"(title, NN)",,NNP
0,0,sleaford mods,key markets,Pitchfork,"(key, NNP)",NN,NNP
0,0,sleaford mods,key markets,Pitchfork,"(markets—, NNP)",NNP,NN
0,0,sleaford mods,key markets,Pitchfork,"(something, NN)",NNP,NNP
0,0,sleaford mods,key markets,Pitchfork,"(sleaford, NNP)",NN,NNP
...,...,...,...,...,...,...,...
23,713,bjork,fossora,NME,"(emily, NNP)",NN,NNP
23,713,bjork,fossora,NME,"(mackay, NNP)",NNP,CD
23,713,bjork,fossora,NME,"(29th, CD)",NNP,NNP
23,713,bjork,fossora,NME,"(september, NNP)",CD,CD


In [18]:
# Separate tuple into word and tag and get rid of the original text column
long_text.loc[:, "Word"] = long_text.loc[:, "Text"].apply(lambda x: x[0])
long_text.loc[:, "Tag"] = long_text.loc[:, "Text"].apply(lambda x: x[1])
long_text = long_text.drop(columns=["Text"])
long_text

Unnamed: 0,Review id,Artist,Album,Platform,ptag,ntag,Word,Tag
0,0,sleaford mods,key markets,Pitchfork,,NNP,title,NN
0,0,sleaford mods,key markets,Pitchfork,NN,NNP,key,NNP
0,0,sleaford mods,key markets,Pitchfork,NNP,NN,markets—,NNP
0,0,sleaford mods,key markets,Pitchfork,NNP,NNP,something,NN
0,0,sleaford mods,key markets,Pitchfork,NN,NNP,sleaford,NNP
...,...,...,...,...,...,...,...,...
23,713,bjork,fossora,NME,NN,NNP,emily,NNP
23,713,bjork,fossora,NME,NNP,CD,mackay,NNP
23,713,bjork,fossora,NME,NNP,NNP,29th,CD
23,713,bjork,fossora,NME,CD,CD,september,NNP


In [19]:
# Retrieve the desired labels as well as the review id and word ids
long_text = long_text.reset_index(drop=True).join(X_labelled.reset_index(drop=True).loc[:, ["Word id", "Desired"]])

In [20]:
long_text

Unnamed: 0,Review id,Artist,Album,Platform,ptag,ntag,Word,Tag,Word id,Desired
0,0,sleaford mods,key markets,Pitchfork,,NNP,title,NN,1,-1.0
1,0,sleaford mods,key markets,Pitchfork,NN,NNP,key,NNP,3,-1.0
2,0,sleaford mods,key markets,Pitchfork,NNP,NN,markets—,NNP,4,-1.0
3,0,sleaford mods,key markets,Pitchfork,NNP,NNP,something,NN,5,-1.0
4,0,sleaford mods,key markets,Pitchfork,NN,NNP,sleaford,NNP,7,-1.0
...,...,...,...,...,...,...,...,...,...,...
10167,713,bjork,fossora,NME,NN,NNP,emily,NNP,775,-1.0
10168,713,bjork,fossora,NME,NNP,CD,mackay,NNP,776,-1.0
10169,713,bjork,fossora,NME,NNP,NNP,29th,CD,777,-1.0
10170,713,bjork,fossora,NME,CD,CD,september,NNP,778,-1.0


In [21]:
long_sentence = pd.read_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\new_unigram_data.h5", key="sentence")

In [22]:
long_sentence

Unnamed: 0,Artist,Album,Platform,Text,Review id
0,sleaford mods,key markets,Pitchfork,"[['The', 'title', 'of', 'Key', 'Markets—', 'so...",0
1,black midi,hellfire,Pitchfork,"[['The', 'preposterously', 'talented', 'Englis...",1
2,kanye west,yeezus,Pitchfork,"[['Marking', 'a', 'blunt', 'break', 'with', 't...",2
3,kehlani,it was good until it wasn't,Pitchfork,"[['The', 'cloudy', 'grooves', 'of', 'the', 'Oa...",3
4,blood orange,negro swan,Pitchfork,"[['Dev', 'Hynes’', 'fourth', 'album', 'as', 'B...",4
...,...,...,...,...,...
709,father john misty,fear fun,NME,"[['Having', 'officially', 'taken', 'leave', 'o...",709
710,beyonce,4,NME,"[['It’s', 'not', 'been', 'a', 'vintage', 'year...",710
711,toro y moi,anything in return,NME,"[['Chazwick', 'Bundick', 'aka', 'Toro', 'Y', '...",711
712,james blake,assume form,NME,"[['James', 'Blake', 'James', 'Blake', 'has', '...",712


In [23]:
import ast

long_sentence.loc[:, "Text"] = long_sentence.loc[:, "Text"].apply(ast.literal_eval)

In [24]:
# Calculate the sentence number of every word
long_sentence = long_sentence.explode("Text")
long_sentence["Sentence id"] = long_sentence.groupby(["Review id"]).cumcount()

In [25]:
long_sentence

Unnamed: 0,Artist,Album,Platform,Text,Review id,Sentence id
0,sleaford mods,key markets,Pitchfork,"[The, title, of, Key, Markets—, something, bet...",0,0
0,sleaford mods,key markets,Pitchfork,"[Jason, Williamson's, speaking, voice, is, glo...",0,1
0,sleaford mods,key markets,Pitchfork,"[It's, the, cornerstone, of, Sleaford, Mods', ...",0,2
0,sleaford mods,key markets,Pitchfork,"[On, stage, the, two, of, them, are, a, deligh...",0,3
0,sleaford mods,key markets,Pitchfork,"[Williamson, and, Fearn, are, both, weary-look...",0,4
...,...,...,...,...,...,...
713,bjork,fossora,NME,"[Fungal, City, is, dizzy, with, lust, The, son...",713,16
713,bjork,fossora,NME,"[And, pop, has, to, an, extent, caught, up, wi...",713,17
713,bjork,fossora,NME,"[That’s, a, good, thing, –, perhaps, as, Björk...",713,18
713,bjork,fossora,NME,"[On, this, form, she, sounds, like, she’s, rea...",713,19


In [26]:
# Calculate the word number for each word
long_sentence = long_sentence.explode("Text")
long_sentence["Word_Sentence id"] = long_sentence.groupby(["Review id", "Sentence id"]).cumcount()
long_sentence["Word id"] = long_sentence.groupby(["Review id"]).cumcount()

In [27]:
long_sentence = long_sentence.reset_index(drop=True)

In [28]:
long_sentence

Unnamed: 0,Artist,Album,Platform,Text,Review id,Sentence id,Word_Sentence id,Word id
0,sleaford mods,key markets,Pitchfork,The,0,0,0,0
1,sleaford mods,key markets,Pitchfork,title,0,0,1,1
2,sleaford mods,key markets,Pitchfork,of,0,0,2,2
3,sleaford mods,key markets,Pitchfork,Key,0,0,3,3
4,sleaford mods,key markets,Pitchfork,Markets—,0,0,4,4
...,...,...,...,...,...,...,...,...
438839,bjork,fossora,NME,Emily,713,20,33,775
438840,bjork,fossora,NME,Mackay,713,20,34,776
438841,bjork,fossora,NME,29th,713,20,35,777
438842,bjork,fossora,NME,September,713,20,36,778


In [29]:
# Keep a copy of the full dataset for sentences
complete_long_sentence = long_sentence.copy()

In [30]:
# To lower case
long_sentence.loc[:, "Text"] = long_sentence.loc[:, "Text"].astype(str).apply(convert_lower)

In [31]:
# Remove stopwords

def sent_check_stop(x):
    
    if x in stopwords.words("english"):
        return False
    else:
        return True

long_sentence["not stop"] = long_sentence.loc[:, "Text"].apply(sent_check_stop)
long_sentence = long_sentence.loc[long_sentence.loc[:, "not stop"]].drop(columns="not stop")

In [32]:
long_sentence

Unnamed: 0,Artist,Album,Platform,Text,Review id,Sentence id,Word_Sentence id,Word id
1,sleaford mods,key markets,Pitchfork,title,0,0,1,1
3,sleaford mods,key markets,Pitchfork,key,0,0,3,3
4,sleaford mods,key markets,Pitchfork,markets—,0,0,4,4
5,sleaford mods,key markets,Pitchfork,something,0,0,5,5
7,sleaford mods,key markets,Pitchfork,sleaford,0,0,7,7
...,...,...,...,...,...,...,...,...
438839,bjork,fossora,NME,emily,713,20,33,775
438840,bjork,fossora,NME,mackay,713,20,34,776
438841,bjork,fossora,NME,29th,713,20,35,777
438842,bjork,fossora,NME,september,713,20,36,778


In [69]:
sent_word = pd.merge(long_text, long_sentence.rename({"Text": "Word"}, axis=1), on=["Review id", "Word id", "Word", "Artist", "Album", "Platform"], how="inner")

In [70]:
from nltk.stem import WordNetLemmatizer

# Get all labelled text
tf_album = pd.merge(labelled, text, on=["Review id"], how="inner")

def list_remove_stop(x):
    
    output = []
    
    for word in x:
        if word not in stopwords.words("english"):
            output.append(word)
            
    return output

def list_to_lower(x):
    
    return [word.lower() for word in x]

tf_album.loc[:, "Text"] = tf_album.loc[:, "Text"].apply(list_to_lower).apply(list_remove_stop)

def join_lists(x):
    
    output = []
            
    for l in x.tolist():
        output = output + l
        
    return output

# Get all text for every album
tf_album = tf_album.groupby(["Album"]).agg({"Text": join_lists})

lemmatiser = WordNetLemmatizer()

# Lemmatise every word for each album
tf_album.loc[:, "Text"] = tf_album.loc[:, "Text"].apply(lambda x: [lemmatiser.lemmatize(word) for word in x])

counts = {}

# For each album get all the relevant text
# Count the occurrences of each word and divide by the total number of word to get the relative frequency of each lemmatised word
for album in tf_album.index:
    
    album_text = tf_album.loc[album, "Text"]
    counts[album] = pd.value_counts(np.array(album_text)) / len(album_text)

In [71]:
# Create a new column that contains the relative frequency of each word in all reviews of that album
sent_word["album_tf"] = sent_word.apply(lambda x: counts[x.loc["Album"]][lemmatiser.lemmatize(x.loc["Word"])], axis=1) 

In [72]:
# Get a copy of the sentence data frame including:
# Ids for word, sentence_word and sentence
# Rows for each word
# All usual album data name, artist, platform
opinion_sentence = complete_long_sentence.copy()
# Make this a set or it takes forever
opinion_words = set(opinion_lexicon.words())

# Get a boolean mask for whether each word is an opinion word
opinion_sentence["Opinion Word"] = opinion_sentence.loc[:, "Text"].apply(lambda x: x in opinion_words)
# Use the integer form of this as a feature in the sent_word dataset
sent_word = pd.merge(sent_word, opinion_sentence.loc[:, ["Sentence id", "Word id", "Review id", "Opinion Word"]],
                     on=["Sentence id", "Word id", "Review id"], how="inner")
sent_word.loc[:, "Opinion Word"] = sent_word.loc[:, "Opinion Word"].astype(int)
# Group by sentence and review id 
opinion_sentence_group = opinion_sentence.groupby(["Sentence id", "Review id"])
# Get the sum of opinion words for each sentence
opinion_sentence_agg = opinion_sentence_group.agg({"Opinion Word": "sum"}).reset_index()
# Get the number of words in each sentence
opinion_sentence_count = opinion_sentence_group.size().reset_index(name="Count")

# Get number of opinion and total words
opinion_sentence_agg = pd.merge(opinion_sentence_agg, opinion_sentence_count, on=["Review id", "Sentence id"], how="inner")
# Merge opinion sentence with these aggregations on review and sentence id
opinion_sentence = pd.merge(opinion_sentence.drop(columns=["Opinion Word"]), opinion_sentence_agg, on=["Sentence id", "Review id"], how="inner")

# Create opinion proportion column
opinion_sentence["Opinion Proportion"] = opinion_sentence.loc[:, "Opinion Word"].div(opinion_sentence.loc[:, "Count"])
# Merge back into main dataframe
sent_word = pd.merge(sent_word, opinion_sentence.loc[:, ["Review id", "Sentence id", "Word id", "Opinion Proportion"]],
                     on=["Review id", "Sentence id", "Word id"], how="inner")

In [85]:
sent_word

Unnamed: 0,Review id,Artist,Album,Platform,ptag,ntag,Word,Tag,Word id,Desired,Sentence id,Word_Sentence id,album_tf,Opinion Word,Opinion Proportion
0,0,sleaford mods,key markets,Pitchfork,,NNP,title,NN,1,-1.0,0,1,0.004359,0,0.000000
1,0,sleaford mods,key markets,Pitchfork,NN,NNP,key,NNP,3,-1.0,0,3,0.005231,0,0.000000
2,0,sleaford mods,key markets,Pitchfork,NNP,NN,markets—,NNP,4,-1.0,0,4,0.001744,0,0.000000
3,0,sleaford mods,key markets,Pitchfork,NNP,NNP,something,NN,5,-1.0,0,5,0.001744,0,0.000000
4,0,sleaford mods,key markets,Pitchfork,NN,NNP,sleaford,NNP,7,-1.0,0,7,0.012206,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10167,713,bjork,fossora,NME,NN,NNP,emily,NNP,775,-1.0,20,33,0.000466,0,0.105263
10168,713,bjork,fossora,NME,NNP,CD,mackay,NNP,776,-1.0,20,34,0.000466,0,0.105263
10169,713,bjork,fossora,NME,NNP,NNP,29th,CD,777,-1.0,20,35,0.000466,0,0.105263
10170,713,bjork,fossora,NME,CD,CD,september,NNP,778,-1.0,20,36,0.000466,0,0.105263


In [74]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder

X = sent_word.drop(columns=["Desired"])
y = sent_word.loc[:, "Desired"]

sc = StandardScaler()
clf = SVC(gamma="auto")
enc = OneHotEncoder()

enc.fit(X.loc[:, ["ptag", "ntag", "Platform", "Tag"]])
one_hot = enc.transform(X.loc[:, ["ptag", "ntag", "Platform", "Tag"]]).toarray()
one_hot = pd.DataFrame(one_hot, columns=enc.get_feature_names_out())
X = pd.concat([X, one_hot], axis=1).drop(columns=["ptag", "ntag", "Word", "Tag", "Platform", "Artist", "Album", "Review id"])
X

Unnamed: 0,Word id,Sentence id,Word_Sentence id,album_tf,Opinion Word,Opinion Proportion,ptag_$,ptag_:,ptag_CC,ptag_CD,...,Tag_RP,Tag_VB,Tag_VBD,Tag_VBG,Tag_VBN,Tag_VBP,Tag_VBZ,Tag_WDT,Tag_WP,Tag_WP$
0,1,0,1,0.004359,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0,3,0.005231,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,0,4,0.001744,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0,5,0.001744,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0,7,0.012206,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10167,775,20,33,0.000466,0,0.105263,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10168,776,20,34,0.000466,0,0.105263,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10169,777,20,35,0.000466,0,0.105263,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10170,778,20,36,0.000466,0,0.105263,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
sc.fit(X)
X = sc.transform(X)

In [76]:
clf.fit(X, y)
sent_word.loc[pd.Series(clf.predict(X)) == 1]

Unnamed: 0,Review id,Artist,Album,Platform,ptag,ntag,Word,Tag,Word id,Desired,Sentence id,Word_Sentence id,album_tf,Opinion Word,Opinion Proportion
3601,2,kanye west,yeezus,Pitchfork,POS,NNS,anti-consumerist,JJ,1559,1.0,54,10,0.000393,0,0.05
9706,537,bjork,fossora,Spectrum,NN,CD,fungi,$,1065,1.0,35,20,0.00233,0,0.095238


In [77]:
X = pd.DataFrame(X)
y_balanced = pd.concat([y.loc[y == 1], y.loc[y != 1].sample(300, random_state=0)], axis=0)
X_balanced = pd.concat([X.loc[y == 1], X.loc[y != 1].sample(300, random_state=0)], axis=0)

In [78]:
pd.set_option("display.max_rows", 313)
clf.fit(X_balanced, y_balanced)
pd.concat([sent_word.loc[y == 1], sent_word.loc[y != 1].sample(300, random_state=0)], axis=0).reset_index(drop=True).loc[pd.Series(clf.predict(X_balanced)).reset_index(drop=True) == 1]

Unnamed: 0,Review id,Artist,Album,Platform,ptag,ntag,Word,Tag,Word id,Desired,Sentence id,Word_Sentence id,album_tf,Opinion Word,Opinion Proportion
0,0,sleaford mods,key markets,Pitchfork,NN,NN,working,JJ,301,1.0,7,4,0.000872,0,0.0
1,0,sleaford mods,key markets,Pitchfork,JJ,NNS,class,NN,302,1.0,7,5,0.001744,0,0.0
2,0,sleaford mods,key markets,Pitchfork,NN,VBN,politics,NNS,304,1.0,7,7,0.002616,0,0.0
3,175,sleaford mods,key markets,Guardian,JJ,RB,anger,NNP,152,1.0,7,0,0.001744,0,0.190476
4,175,sleaford mods,key markets,Guardian,NN,VBP,humour,NN,166,1.0,7,14,0.000872,1,0.190476
5,363,sleaford mods,key markets,Spectrum,JJ,NN,socio-political,JJ,149,1.0,5,8,0.000872,0,0.166667
6,363,sleaford mods,key markets,Spectrum,JJ,NN,rant,NN,150,1.0,5,9,0.003487,1,0.166667
7,363,sleaford mods,key markets,Spectrum,JJ,VBD,life,NN,155,1.0,5,14,0.001744,0,0.166667
8,363,sleaford mods,key markets,Spectrum,VBN,NNP,humor,NN,170,1.0,5,29,0.002616,1,0.166667
9,363,sleaford mods,key markets,Spectrum,VBZ,NN,humor,NN,313,1.0,12,4,0.002616,1,0.121212


In [82]:
pd.read_hdf("C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\new_unigram_data.h5", key="sentence")

Unnamed: 0,Artist,Album,Platform,Text,Review id
0,sleaford mods,key markets,Pitchfork,"[['The', 'title', 'of', 'Key', 'Markets—', 'so...",0
1,black midi,hellfire,Pitchfork,"[['The', 'preposterously', 'talented', 'Englis...",1
2,kanye west,yeezus,Pitchfork,"[['Marking', 'a', 'blunt', 'break', 'with', 't...",2
3,kehlani,it was good until it wasn't,Pitchfork,"[['The', 'cloudy', 'grooves', 'of', 'the', 'Oa...",3
4,blood orange,negro swan,Pitchfork,"[['Dev', 'Hynes’', 'fourth', 'album', 'as', 'B...",4
...,...,...,...,...,...
709,father john misty,fear fun,NME,"[['Having', 'officially', 'taken', 'leave', 'o...",709
710,beyonce,4,NME,"[['It’s', 'not', 'been', 'a', 'vintage', 'year...",710
711,toro y moi,anything in return,NME,"[['Chazwick', 'Bundick', 'aka', 'Toro', 'Y', '...",711
712,james blake,assume form,NME,"[['James', 'Blake', 'James', 'Blake', 'has', '...",712
