In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
import re

pd.set_option("display.max_columns",100)

def read_in_raw_data():
    df = pd.read_csv('../data/rt_data_dump.csv')
    return df

def drop_subpar_info(df):
    # Drop duplicate columns
    df.drop(['Unnamed: 0', 'rt_id.1', '_id'], axis=1, inplace=True)
    # Drop non-text annotations
    img_only_idxs = df[df['tate_text'].isna()].index
    df.drop(img_only_idxs, axis=0, inplace=True)
    # All songs are "False" -- therefore, this doesn't add anything!
    df.drop('hot_song', axis=1, inplace=True)

def drop_empty_text(df, in_tates=False, in_refs=False):
    # Drop rows with non-useful text
    if in_tates:
        is_null = df['tate_text'].isna()
        img_only_idxs = df[is_null].index
        df.drop(img_only_idxs, axis=0, inplace=True)
    if in_refs:
        is_empty_str = df['ref_text'].str.len() < 1
        empty_str_idxs = list(df[is_empty_str].index)
        df.drop(empty_str_idxs, axis=0, inplace=True)

def standardize_votes_col(df):
    # Create standardized "votes" feature (takes pageviews into account)
    df['votes_per_1000views'] = (100000 * df['votes_total'] / df['pageviews']).round(2)
    return df

def make_txt_length_features(df):
    df['chars_in_tate'] = df['tate_text'].str.len()
    df['chars_in_referent'] = df['ref_text'].str.len()
    df['ref_word_lst'] = df['ref_text'].str.lower().str.split()
    df['tate_word_lst'] = df['tate_text'].str.lower().str.split()
    df['ref_word_cnt'] = df['ref_word_lst'].str.len()
    df['tate_word_cnt'] = df['tate_word_lst'].str.len()
    return df

def remove_verse_tags_from_tate_text(df):
    ref_lines = list(df['ref_text'])
    # remove tag from lines that have both brackets
    for idx, line in enumerate(ref_lines):
        ref_lines[idx] = re.sub(r'\[.*?\]', '', line)
    df['ref_text'] = ref_lines
    # if ref_text now empty str, remove the entire row (whole ref was a tag)
    drop_empty_text(df, in_refs=True)
    drop_partial_tag_referents(df, ref_lines)
    return df

def drop_partial_tag_referents(df, ref_lines):
    p = re.compile('\[|\]')
    partial_tag_idxs = []
    for idx, line in enumerate(ref_lines):
        has_bracket = p.search(line) != None
        if has_bracket:
            partial_tag_idxs.append(idx)
    df.drop(partial_tag_idxs, axis=0, inplace=True)

def drop_select_parenthesis_referents(df):
    subpar_ref_texts = ['(21st-Century schizoid man)', 'Chorus', 'Justin Vernon', 'Kóbor János', 'Intro:', 'ENSEMBLE', 'JEFFERSON', 'Verse 2: Eminem', '[Chorus: KING GEORGE', '*Space Bar Tap*', 'BURR', 'LEE', '(Guitar Solo)']
    subpar_ref_idxs = []
    for subpar_ref in subpar_ref_texts:
        is_subpar = df['ref_text'] == subpar_ref
        subpar_idxs = list(df[is_subpar].index)
        for i in subpar_idxs:
            subpar_ref_idxs.append(i)
    df.drop(subpar_ref_idxs, axis=0, inplace=True)


In [2]:
df = read_in_raw_data()
drop_subpar_info(df)
drop_empty_text(df, in_tates=True)
df = standardize_votes_col(df)
df = make_txt_length_features(df)
df = remove_verse_tags_from_tate_text(df)
drop_select_parenthesis_referents(df)

In [3]:
df.shape

(3178, 28)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import spacy
from sklearn.model_selection import train_test_split
import gensim
import os
import collections
import smart_open
import random
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import re

In [7]:
def make_train_test_split(df, get_holdout=False):
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    if get_holdout:
        holdout_df = test_df.copy()
        train_df.to_csv('../data/genius_train_data_320i.csv')
        holdout_df.to_csv('../data/genius_holdout_data_320i.csv')
        return train_df
    else:
        return train_df, test_df

def get_ref_tate_dfs(df):
    ref_df = df[['ref_text', 'rt_id']]
    tate_df = df[['tate_text', 'rt_id']]
    ref_df.reset_index(drop=True, inplace=True)
    tate_df.reset_index(drop=True, inplace=True)
    return ref_df, tate_df

def make_combined_rt_df(ref_df, tate_df, is_train=True):
    ref_df.columns = ['text', 'rt_id']
    tate_df.columns = ['text', 'rt_id']
    ref_df['is_ref'] = 1
    tate_df['is_ref'] = 0
    rt_df = pd.concat([ref_df, tate_df], axis=0)
    rt_df.reset_index(drop=True, inplace=True)
    return rt_df

def isolate_corpuses(txt_train_df, txt_test_df):
    text_train = txt_train_df['text']
    text_test = txt_test_df['text']
    return text_train, text_test

def make_corpus_dicts(ref_train_df, ref_test_df, tate_train_df, tate_test_df, rt_train_df, rt_test_df):
    txt_dfs = [(ref_train_df, ref_test_df), (tate_train_df, tate_test_df), (rt_train_df, rt_test_df)]
    corpus_cats = ['referent', 'tate', 'ref-tate']
    corpus_dict = dict()
    for idx, txt_type in enumerate(txt_dfs):
        corpus_type = corpus_cats[idx]
        txt_train_df, txt_test_df = txt_type
        text_train, text_test = isolate_corpuses(txt_train_df, txt_test_df)
        corpus_dict[corpus_type] = (text_train, text_test)
    return corpus_dict

def make_rt_doc_dicts(doc_train_df, doc_test_df):
    doc_idx_train = doc_train_df['rt_id']
    doc_idx_test = doc_test_df['rt_id']

    doc_idx_train_dict = doc_idx_train.to_dict()
    doc_idx_test_dict = doc_idx_test.to_dict()
    return doc_idx_train_dict, doc_idx_test_dict

def get_token_lst(corpus, tagged_docs=False, include_ref_tag=False):
    # corpus = list(corpus)
    for idx, line in enumerate(corpus):
        # Returns list of strings where contraction words, newlines, and punctuation is preserved
        tokens = re.findall(r"[\w'|\w’]+|[-–()\"\“\”.,!?;]+|[\n]+", line)
        tokens = [x.lower() for x in tokens]
        if tagged_docs:
            is_tate = 0 if idx < 2033 else 1
            tag_lst = [idx, is_tate] if include_ref_tag else [idx]
            yield gensim.models.doc2vec.TaggedDocument(tokens, tag_lst)
        else:
            yield tokens

def get_train_test_corpuses(txt_series_train, txt_series_test, include_ref_tag=False):
    train_corpus = list(get_token_lst(txt_series_train, tagged_docs=True, include_ref_tag=include_ref_tag))
    test_corpus = list(get_token_lst(txt_series_test))
    return train_corpus, test_corpus  
    

In [8]:
# df = preprocessing.main()
data = make_train_test_split(df, get_holdout=True)
train_df, test_df = make_train_test_split(data, get_holdout=False)
ref_train_df, tate_train_df = get_ref_tate_dfs(train_df)
ref_test_df, tate_test_df = get_ref_tate_dfs(test_df)
rt_train_df = make_combined_rt_df(ref_train_df, tate_train_df, is_train=True)
rt_test_df = make_combined_rt_df(ref_test_df, tate_test_df, is_train=False)

corpus_dict = make_corpus_dicts(ref_train_df, ref_test_df, tate_train_df, tate_test_df, rt_train_df, rt_test_df)

doc_rt_train_dict, doc_rt_test_dict = make_rt_doc_dicts(ref_train_df, ref_test_df)
full_train_dict, full_test_dict = make_rt_doc_dicts(rt_train_df, rt_test_df)

ref_train_pcorpus, ref_test_pcorpus = list(get_train_test_corpuses(corpus_dict['referent'][0], corpus_dict['referent'][1]))
tate_train_pcorpus, tate_test_pcorpus = list(get_train_test_corpuses(corpus_dict['tate'][0], corpus_dict['tate'][1]))
rt_train_pcorpus, rt_test_pcorpus = list(get_train_test_corpuses(corpus_dict['ref-tate'][0], corpus_dict['ref-tate'][1]))
rt_tagged_train_pcorpus, rt_tagged_test_pcorpus = list(get_train_test_corpuses(corpus_dict['ref-tate'][0], corpus_dict['ref-tate'][1], include_ref_tag=True))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Okay, so I've checked, and tagging is working! Yay

Note: 0 = referent, 1 = annotation

In [9]:
rt_tagged_train_pcorpus[4000]

TaggedDocument(words=['the', 'use', 'of', '“', 'rewind', '”', 'is', 'an', 'anachronism', '.', 'they', 'had', 'no', 'tapes', 'or', 'film', 'to', 'rewind', 'at', 'the', 'time', '.'], tags=[4000, 1])

In [10]:
len(ref_test_pcorpus)

509

**GREAT! Text matches up between indices across corpus_dict and _pcorpus variations**

In [12]:
# Check to see how idxs line up between corpus_dict and pcorpus lists
print(rt_train_pcorpus[10].words)
print(corpus_dict['ref-tate'][0][10])

['god', 'damn', ',', 'got', 'a', 'bald', 'fade', ',', 'i', 'might', 'slam']
God damn, got a bald fade, I might slam


**Alright, before we make & train models, let's make a couple more dictionaries (along with the rt_id mixing function)**

In [19]:
def make_doc_rt_dic(doc_rt_train_dict, doc_rt_test_dict):
    rt_doc_train_dict = dict((v,k) for k,v in doc_rt_train_dict.items())
    rt_doc_test_dict = dict((v,k) for k,v in doc_rt_test_dict.items())
    return rt_doc_train_dict, rt_doc_test_dict

def rt_in_train_dic(data, train_df):
    # need to make a dict that will tell us if an rt_id is in train or test set
    rt_in_training_dict = dict()
    for rt_id in data['rt_id'].unique():
        if rt_id in list(train_df['rt_id']):
            rt_in_training_dict[rt_id] = True
        else:
            rt_in_training_dict[rt_id] = False
    return rt_in_training_dict

def artist_rt_dic(data, artists):
    artist_rt_dict = dict()
    for artist in artists:
        is_artist = data['artist_name'] == artist
        rt_ids = data[is_artist]['rt_id'].unique()
        artist_rt_dict[artist] = rt_ids
    return artist_rt_dict

def rt_artist_dic(artist_rt_dict):
    rt_artist_dict = dict()
    for k,v in artist_rt_dict.items():
        for i in v:
            rt_artist_dict[i] = k
    return rt_artist_dict

def print_annotations_per_artist(artist_rtid_dict):
    print("ANNOTATIONS PER ARTIST:\n")
    for k, v in artist_rt_dict.items():
        print(k + ': ' + str(len(v)))

In [157]:
artists = data['artist_name'].unique()

rt_doc_train_dict, rt_doc_test_dict = make_doc_rt_dicts(doc_rt_train_dict, doc_rt_test_dict)
rt_in_training_dict = rt_in_train_dic(data, train_df)
artist_rt_dict = artist_rt_dic(data, artists)
rt_artist_dict = rt_artist_dic(artist_rt_dict)
print_annotations_per_artist(artist_rt_dict)

ANNOTATIONS PER ARTIST:

Original Broadway Cast of Hamilton: 334
Ariana Grande: 211
JAY-Z: 123
Drake: 227
Beyoncé: 82
The Weeknd: 224
XXXTENTACION: 205
J. Cole: 296
Eminem: 309
Kendrick Lamar: 225
Kanye West: 208
Lil Wayne: 98


In [158]:
def get_true_rt_pairs(train_df):
    pair_df = train_df[['rt_id', 'artist_name']]

    pair_df['ref_rtid'] = pair_df['rt_id'].copy()
    pair_df['tate_rtid'] = pair_df['rt_id'].copy()
    pair_df['ref_artist'] = pair_df['artist_name'].copy()
    pair_df['tate_artist'] = pair_df['artist_name'].copy()

    return pair_df

In [159]:
pd.options.mode.chained_assignment = None  # default='warn'

In [160]:
pair_df = get_true_rt_pairs(train_df)
pair_df.head()

Unnamed: 0,rt_id,artist_name,ref_rtid,tate_rtid,ref_artist,tate_artist
844,2552277,Beyoncé,2552277,2552277,Beyoncé,Beyoncé
721,2233756,Drake,2233756,2233756,Drake,Drake
1257,5029023,J. Cole,5029023,5029023,J. Cole,J. Cole
1757,8663001,Kanye West,8663001,8663001,Kanye West,Kanye West
2716,12365147,JAY-Z,12365147,12365147,JAY-Z,JAY-Z


In [166]:
x = np.array([])

In [167]:
x

array([], dtype=float64)

In [168]:
def get_mismatched_rt_pairs(pair_df, rt_artist_dict):
    mixed_pair_df = pair_df[['ref_rtid', 'tate_rtid', 'ref_artist', 'tate_artist']].copy()

    ref_ids = np.array([mixed_pair_df['ref_rtid']]).reshape(-1,)
    tate_ids = np.array([mixed_pair_df['tate_rtid']]).reshape(-1,)

    shuffled_ref_rt, shuffled_tate_rt = shuffle_rtids(ref_ids, tate_ids)
    mism_ref_rt = np.array([])
    mism_tate_rt = np.array([])
    cnt = 0
    while shuffled_ref_rt.shape[0] > 0:
        mism_ref_rt, mism_tate_rt, shuffled_ref_rt, shuffled_tate_rt = separate_same_artist_rtids(shuffled_ref_rt, shuffled_tate_rt, rt_artist_dict, mism_tate_rt, mism_ref_rt)
        cnt += 1
        if cnt > 15:
            mism_ref_rt = np.concatenate([mism_ref_rt, shuffled_ref_rt])
            mism_tate_rt = np.concatenate([mism_tate_rt, shuffled_tate_rt])
            break

    return mism_ref_rt, mism_tate_rt

def shuffle_rtids(ref_ids, tate_ids):
    n = ref_ids.shape[0]
    shuffled_ref_rt = np.random.choice(ref_ids, n, replace=False)
    shuffled_tate_rt = np.random.choice(tate_ids, n, replace=False)
    return shuffled_ref_rt, shuffled_tate_rt

def separate_same_artist_rtids(shuffled_ref_rt, shuffled_tate_rt, rt_artist_dict, mism_tate_rt, mism_ref_rt):
    shuff_ref_artist = np.array([rt_artist_dict[rtid] for rtid in shuffled_ref_rt])
    shuff_tate_artist = np.array([rt_artist_dict[rtid] for rtid in shuffled_tate_rt])

    diff_artist = shuff_tate_artist != shuff_ref_artist
    mism_tate_rt = mism_tate_rt
    mism_ref_rt = mism_ref_rt
    mism_ref_rt = np.concatenate([mism_ref_rt, shuffled_ref_rt[diff_artist]])
    mism_tate_rt = np.concatenate([mism_tate_rt, shuffled_tate_rt[diff_artist]])

    shuffled_ref_rt = shuffled_ref_rt[~diff_artist]
    shuffled_tate_rt = shuffled_tate_rt[~diff_artist]

    return mism_ref_rt, mism_tate_rt, shuffled_ref_rt, shuffled_tate_rt





In [169]:
shuff_ref_rt, shuff_tate_rt = get_mismatched_rt_pairs(pair_df, rt_artist_dict)

print(len(shuff_ref_rt))
print(len(shuff_tate_rt))

2033
2033


In [171]:
print(shuff_ref_rt[:10])
print(shuff_tate_rt[:10])

[ 8680419. 10110962. 12583966. 14296661. 11035363.   290979.  5049708.
  3943027. 12922232.  9424113.]
[16448444.  1681237. 11593217. 10978998. 12582091. 16425506. 11675883.
 11141972.  8093284. 10555447.]


In [170]:
mism_df = pd.DataFrame({'ref_id': shuff_ref_rt, 'tate_id': shuff_tate_rt})
mism_df.head()

Unnamed: 0,ref_id,tate_id
0,8680419.0,16448444.0
1,10110962.0,1681237.0
2,12583966.0,11593217.0
3,14296661.0,10978998.0
4,11035363.0,12582091.0


In [174]:
shuff_ref_artist = np.array([rt_artist_dict[rtid] for rtid in mism_df['ref_id']])
shuff_tate_artist = np.array([rt_artist_dict[rtid] for rtid in mism_df['tate_id']])

In [175]:
mism_df['ref_artist'] = shuff_ref_artist
mism_df['tate_artist'] = shuff_tate_artist
mism_df.head()

Unnamed: 0,ref_id,tate_id,ref_artist,tate_artist
0,8680419.0,16448444.0,Kendrick Lamar,Ariana Grande
1,10110962.0,1681237.0,Original Broadway Cast of Hamilton,Drake
2,12583966.0,11593217.0,XXXTENTACION,Kendrick Lamar
3,14296661.0,10978998.0,Drake,J. Cole
4,11035363.0,12582091.0,The Weeknd,XXXTENTACION


In [177]:
rt_in_training_dict

'Ariana Grande'

In [178]:
shuff_ref_in_training = np.array([rt_in_training_dict[rtid] for rtid in mism_df['ref_id']])
shuff_tate_in_training = np.array([rt_in_training_dict[rtid] for rtid in mism_df['tate_id']])

mism_df['ref_in_train'] = shuff_ref_in_training
mism_df['tate_in_train'] = shuff_tate_in_training
mism_df.head()

Unnamed: 0,ref_id,tate_id,ref_artist,tate_artist,ref_in_train,tate_in_train
0,8680419.0,16448444.0,Kendrick Lamar,Ariana Grande,True,True
1,10110962.0,1681237.0,Original Broadway Cast of Hamilton,Drake,True,True
2,12583966.0,11593217.0,XXXTENTACION,Kendrick Lamar,True,True
3,14296661.0,10978998.0,Drake,J. Cole,True,True
4,11035363.0,12582091.0,The Weeknd,XXXTENTACION,True,True


In [188]:
shuff_ref_doc_id = []
for rtid in mism_df['ref_id']:
    at_rtid = mism_df['ref_id'] == rtid
    if mism_df[at_rtid]['ref_in_train'].bool():
        doc_id = rt_doc_train_dict[rtid]
    else:
        doc_id = rt_doc_test_dict[rtid]
    shuff_ref_doc_id.append(doc_id)
    
shuff_tate_doc_id = []
for rtid in mism_df['tate_id']:
    at_rtid = mism_df['tate_id'] == rtid
    if mism_df[at_rtid]['ref_in_train'].bool():  
        doc_id = rt_doc_train_dict[rtid]
    else:
        doc_id = rt_doc_test_dict[rtid]
    shuff_tate_doc_id.append(doc_id)    
    
mism_df['ref_doc_id'] = shuff_ref_doc_id
mism_df['tate_doc_id'] = shuff_tate_doc_id
mism_df.head()

Unnamed: 0,ref_id,tate_id,ref_artist,tate_artist,ref_in_train,tate_in_train,ref_doc_id,tate_doc_id
0,8680419.0,16448444.0,Kendrick Lamar,Ariana Grande,True,True,62,737
1,10110962.0,1681237.0,Original Broadway Cast of Hamilton,Drake,True,True,1986,1721
2,12583966.0,11593217.0,XXXTENTACION,Kendrick Lamar,True,True,622,1642
3,14296661.0,10978998.0,Drake,J. Cole,True,True,32,1786
4,11035363.0,12582091.0,The Weeknd,XXXTENTACION,True,True,154,807


In [193]:
shuff_ref_raw_txt = []
for doc_id in mism_df['ref_doc_id']:
    at_doc_id = mism_df['ref_doc_id'] == doc_id
    if mism_df[at_doc_id]['ref_in_train'].bool():      
        raw_txt = corpus_dict['referent'][0][doc_id]
    else:
        raw_txt = corpus_dict['referent'][1][doc_id]
    shuff_ref_raw_txt.append(raw_txt)
    
shuff_tate_raw_txt = []
for doc_id in mism_df['tate_doc_id']:
    at_doc_id = mism_df['tate_doc_id'] == doc_id
    if mism_df[at_doc_id]['ref_in_train'].bool():     
        raw_txt = corpus_dict['tate'][0][doc_id]
    else:
        raw_txt = corpus_dict['tate'][1][doc_id]
    shuff_tate_raw_txt.append(raw_txt)    
    
mism_df['ref_raw_text'] = shuff_ref_raw_txt
mism_df['tate_raw_text'] = shuff_tate_raw_txt
mism_df.head()

Unnamed: 0,ref_id,tate_id,ref_artist,tate_artist,ref_in_train,tate_in_train,ref_doc_id,tate_doc_id,ref_raw_text,tate_raw_text
0,8680419.0,16448444.0,Kendrick Lamar,Ariana Grande,True,True,62,737,"Better yet, where your friends and 'em?\n I re...",Ariana compares her tumbling relationship to a...
1,10110962.0,1681237.0,Original Broadway Cast of Hamilton,Drake,True,True,1986,1721,Boooo!,"Regardless of how many people she is seeing, D..."
2,12583966.0,11593217.0,XXXTENTACION,Kendrick Lamar,True,True,622,1642,"Only time I feel pain, when I'm feelin' love",Hip-hop has a long standing relationship with ...
3,14296661.0,10978998.0,Drake,J. Cole,True,True,32,1786,"Hoes talk down, you don't see 'em outside\n Ye...",“Head game” is a slang expression for how good...
4,11035363.0,12582091.0,The Weeknd,XXXTENTACION,True,True,154,807,"Why don't you shake some, shake somethin'\n Fo...",X has never made it explicitly clear what his ...


In [200]:
mism_df[mism_df['ref_doc_id'] > 2031]

Unnamed: 0,ref_id,tate_id,ref_artist,tate_artist,ref_in_train,tate_in_train,ref_doc_id,tate_doc_id,ref_raw_text,tate_raw_text
383,12587504.0,7669871.0,XXXTENTACION,The Weeknd,True,True,2032,324,So this is the end\n Waste of tears\n Waste of...,The Weeknd recalled the way his mom looked at ...


**Thank god, looks like we only have doc_ids for 0-2032, so that works with the corpuses...**

In [201]:
shuff_ref_pp_txt = []
for doc_id in mism_df['ref_doc_id']:
    at_doc_id = mism_df['ref_doc_id'] == doc_id
    if mism_df[at_doc_id]['ref_in_train'].bool():  
        pp_txt = ref_train_pcorpus[doc_id].words
    else:
        pp_txt = ref_test_pcorpus[doc_id]
    shuff_ref_pp_txt.append(pp_txt)
    
shuff_tate_pp_txt = []
for doc_id in mism_df['tate_doc_id']:
    at_doc_id = mism_df['tate_doc_id'] == doc_id
    if mism_df[at_doc_id]['ref_in_train'].bool():     
        pp_txt = tate_train_pcorpus[doc_id].words
    else:
        pp_txt = tate_test_pcorpus[doc_id].words
    shuff_tate_pp_txt.append(pp_txt)    
    
mism_df['ref_pp_text'] = shuff_ref_pp_txt
mism_df['tate_pp_text'] = shuff_tate_pp_txt
mism_df.head()

Unnamed: 0,ref_id,tate_id,ref_artist,tate_artist,ref_in_train,tate_in_train,ref_doc_id,tate_doc_id,ref_raw_text,tate_raw_text,ref_pp_text,tate_pp_text
0,8680419.0,16448444.0,Kendrick Lamar,Ariana Grande,True,True,62,737,"Better yet, where your friends and 'em?\n I re...",Ariana compares her tumbling relationship to a...,"[better, yet, ,, where, your, friends, and, 'e...","[ariana, compares, her, tumbling, relationship..."
1,10110962.0,1681237.0,Original Broadway Cast of Hamilton,Drake,True,True,1986,1721,Boooo!,"Regardless of how many people she is seeing, D...","[boooo, !]","[regardless, of, how, many, people, she, is, s..."
2,12583966.0,11593217.0,XXXTENTACION,Kendrick Lamar,True,True,622,1642,"Only time I feel pain, when I'm feelin' love",Hip-hop has a long standing relationship with ...,"[only, time, i, feel, pain, ,, when, i'm, feel...","[hip, -, hop, has, a, long, standing, relation..."
3,14296661.0,10978998.0,Drake,J. Cole,True,True,32,1786,"Hoes talk down, you don't see 'em outside\n Ye...",“Head game” is a slang expression for how good...,"[hoes, talk, down, ,, you, don't, see, 'em, ou...","[“, head, game, ”, is, a, slang, expression, f..."
4,11035363.0,12582091.0,The Weeknd,XXXTENTACION,True,True,154,807,"Why don't you shake some, shake somethin'\n Fo...",X has never made it explicitly clear what his ...,"[why, don't, you, shake, some, ,, shake, somet...","[x, has, never, made, it, explicitly, clear, w..."


In [202]:
shuff_ref_corpus = list(mism_df['ref_pp_text'])
shuff_tate_corpus = list(mism_df['tate_pp_text'])

shuff_ref_corpus[:10]

[['better',
  'yet',
  ',',
  'where',
  'your',
  'friends',
  'and',
  "'em",
  '?',
  '\n',
  'i',
  'really',
  'wanna',
  'know',
  'you',
  'all'],
 ['boooo', '!'],
 ['only', 'time', 'i', 'feel', 'pain', ',', 'when', "i'm", "feelin'", 'love'],
 ['hoes',
  'talk',
  'down',
  ',',
  'you',
  "don't",
  'see',
  "'em",
  'outside',
  '\n',
  'yeah',
  ',',
  'they',
  "don't",
  'really',
  'be',
  'the',
  'same',
  'offline'],
 ['why',
  "don't",
  'you',
  'shake',
  'some',
  ',',
  'shake',
  "somethin'",
  '\n',
  'for',
  'the',
  'don',
  '?',
  "don't",
  'you',
  'break',
  "nothin'",
  ',',
  'break',
  "nothin'",
  '\n',
  'baby',
  'girl',
  ',',
  "won't",
  'you',
  'work',
  'some',
  ',',
  'work',
  "somethin'",
  '\n',
  'for',
  'the',
  'don',
  '?',
  "don't",
  'you',
  'hurt',
  "nothin'",
  ',',
  'hurt',
  "nothin'",
  '\n',
  'big',
  'girl',
  ',',
  "won't",
  'you',
  'shake',
  'some',
  ',',
  'shake',
  "somethin'",
  '(',
  'shake',
  'some',
  ')'