In [1]:
import ast
import os
import pickle
import re
import sqlite3
import time
from collections import namedtuple

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomasbritnell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Data Acquisition

##### I tried scraping RateYourMusic to get data more personalized to me, but I kept running into CAPTCHA problems. Instead, I am using pre-scraped data from Kaggle: 18k+ album reviews scraped from Pitchfork, a popular music website. The data came as an SQLite database; for processing I export it to a CSV file and load it into a pandas DataFrame.

In [2]:
# Build the raw review DataFrame: query the SQLite DB once, then reuse the CSV
# copy on later runs.
if not os.path.exists("reviews.csv"):
    # sqlite3.connect() creates an empty database when the file is missing, which
    # would only surface later as a "no such table" error, so fail early instead.
    if not os.path.exists('database.sqlite'):
        raise FileNotFoundError(
            "database.sqlite not found; it is needed to build reviews.csv"
        )

    # Attach each review's text (content table) to its row in the reviews table.
    query = '''
    SELECT r.*, c.content
    FROM reviews r
    JOIN content c ON r.reviewid = c.reviewid
    '''

    conn = sqlite3.connect('database.sqlite')
    try:
        df_og = pd.read_sql_query(query, conn)
    finally:
        # Release the database file handle even if the query fails.
        conn.close()

    df_og.to_csv("reviews.csv", index=False)
    print("saved to file")

else:
    df_og = pd.read_csv("reviews.csv")
    print("read from file")

read from file


## Data Processing

##### Lowercase the text, strip punctuation, pass it to TextBlob for sentiment, then remove stopwords from the extracted words


In [None]:

# English stopword list from NLTK, held as a set for the membership test in remove_stopwords.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a piece of text.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, each run of whitespace is squeezed to a single
    space, and leading/trailing whitespace is trimmed.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

def remove_stopwords(word_list):
    """Return the items of `word_list` whose lowercase form is not in `stop_words`.

    The original casing and order of the kept words are preserved.
    """
    kept = []
    for word in word_list:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept


# Named tuple used to rebuild the `sentiment` column when it is read back from
# the CSV cache, so its values still expose .polarity and .subjectivity.
_Sentiment = namedtuple("Sentiment", ["polarity", "subjectivity"])


def _parse_sentiment(text):
    """Rebuild a (polarity, subjectivity) tuple from its cached text form.

    Accepts 'Sentiment(polarity=P, subjectivity=S)' as well as a plain '(P, S)'.

    Raises:
        ValueError: if `text` matches neither form.
    """
    match = re.fullmatch(
        r'(?:Sentiment)?\(\s*(?:polarity=)?([^,]+),\s*(?:subjectivity=)?([^)]+)\)',
        str(text).strip(),
    )
    if match is None:
        raise ValueError(f"unrecognised cached sentiment value: {text!r}")
    return _Sentiment(float(match.group(1)), float(match.group(2)))


if not os.path.exists("reviews_blobbed.csv"):
    start_time = time.time()

    # Remove author_type before dropna(): per the original note it has the most
    # missing values and is not needed, so it should not cause rows to be dropped.
    df = df_og.drop(columns=['author_type']).dropna()

    # Lowercase the review text and strip its punctuation.
    df['content_clean'] = df['content'].apply(preprocess_text)

    # Drop columns that are not used further; the raw content is superseded by
    # content_clean.
    drop_col = ["reviewid", "url", "best_new_music", "content"]
    df = df.drop(columns=drop_col)

    # Wrap each cleaned review in a TextBlob; sentiment and words derive from it.
    df['blob'] = df['content_clean'].apply(lambda x: TextBlob(str(x)))

    df['sentiment'] = df['blob'].apply(lambda x: x.sentiment)
    print(f"[INFO] Sentiment extracted. Elapsed: {time.time() - start_time:.2f}s")

    df['words'] = df['blob'].apply(lambda x: remove_stopwords(x.words))
    print(f"[INFO] Words extracted. Elapsed: {time.time() - start_time:.2f}s")

    df.to_csv("reviews_blobbed.csv", index=False)
    print(f"[INFO] DataFrame saved to reviews_blobbed.csv. Total elapsed: {time.time() - start_time:.2f}s")

else:
    df = pd.read_csv("reviews_blobbed.csv")

    # A CSV stores only text, so blob / sentiment / words come back as plain
    # strings and an empty review comes back as NaN. Rebuild the objects the
    # branch above produces so later cells behave the same whichever branch ran.
    df['content_clean'] = df['content_clean'].fillna('')
    df['blob'] = df['content_clean'].apply(lambda x: TextBlob(str(x)))
    df['sentiment'] = df['sentiment'].apply(_parse_sentiment)
    # Restored words are plain str items rather than TextBlob Word objects.
    df['words'] = df['words'].apply(ast.literal_eval)
    print("read from file")


[INFO] Sentiment extracted. Elapsed: 41.48s
[INFO] Words extracted. Elapsed: 94.36s
[INFO] DataFrame saved to reviews_blobbed.csv. Total elapsed: 102.44s


In [7]:
# Preview the first five rows of the processed DataFrame.
df.head()

Unnamed: 0,title,artist,score,pub_date,pub_weekday,pub_day,pub_month,pub_year,author_id,content_clean,blob,sentiment,words
0,mezzanine,massive attack,9.3,2017-01-08,6,8,1,2017,0,triphop eventually became a 90s punchline a mu...,"(t, r, i, p, h, o, p, , e, v, e, n, t, u, a, ...","(0.09725585628363403, 0.48523097273097265)","[triphop, eventually, became, 90s, punchline, ..."
1,prelapsarian,krallice,7.9,2017-01-07,5,7,1,2017,1,eight years five albums and two eps in the new...,"(e, i, g, h, t, , y, e, a, r, s, , f, i, v, ...","(0.04265521064301552, 0.41637102734663706)","[eight, years, five, albums, two, eps, new, yo..."
2,all of them naturals,uranium club,7.3,2017-01-07,5,7,1,2017,2,minneapolis uranium club seem to revel in bein...,"(m, i, n, n, e, a, p, o, l, i, s, , u, r, a, ...","(0.12283243486073672, 0.4370283018867924)","[minneapolis, uranium, club, seem, revel, aggr..."
3,first songs,"kleenex, liliput",9.0,2017-01-06,4,6,1,2017,3,kleenex began with a crash it transpired one n...,"(k, l, e, e, n, e, x, , b, e, g, a, n, , w, ...","(0.1569563297222872, 0.4853611046164239)","[kleenex, began, crash, transpired, one, night..."
4,new start,taso,8.1,2017-01-06,4,6,1,2017,4,it is impossible to consider a given release b...,"(i, t, , i, s, , i, m, p, o, s, s, i, b, l, ...","(0.19372885256505948, 0.40957145345076384)","[impossible, consider, given, release, footwor..."
