In [175]:
import sqlite3
import re
import string
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV, ElasticNetCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, r2_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA

In [2]:
conn = sqlite3.connect('database.sqlite')

In [3]:
query = """
SELECT r.reviewid,
       r.title,
       r.artist,
       r.score,
       r.best_new_music,
       r.author,
       r.pub_date,
       c.content,
       g.genre
FROM reviews r
JOIN genres g ON g.reviewid = r.reviewid
JOIN content c ON c.reviewid = r.reviewid
ORDER BY pub_date;
"""

df = pd.read_sql(query, conn)
df['genre'] = df['genre'].fillna('other')

In [4]:
genre_dummies = pd.get_dummies(df['genre'], prefix='genre')
genre_dummies = pd.concat([df['reviewid'], genre_dummies], axis='columns')

def normalize_values(x):
    if x == 0:
        return 0
    else:
        return 1
    
genre_dummies = genre_dummies.groupby('reviewid').sum()
    
for col in genre_dummies.columns:
    if 'genre' in col:
        genre_dummies[col] = genre_dummies[col].apply(normalize_values)
        
df = df.groupby('reviewid').first().join(genre_dummies, 'reviewid').drop('genre', axis=1)

In [44]:
class MemorizingSnowballStemmer():
    
    def __init__(self, lang, **kwargs):
        self._stemmer = SnowballStemmer(lang, **kwargs)
        self._mem = {}
        
    def stem(self, word):
        stemmed_word = self._stemmer.stem(word)
        if stemmed_word in self._mem:
            if word not in self._mem[stemmed_word]:
                self._mem[stemmed_word].append(word)
        else:
            self._mem[stemmed_word] = [word]
        return stemmed_word
    
    def unstem(self, stem):
        if stem in self._mem:
            return self._mem[stem]
        else:
            return []

In [45]:
stemmer = MemorizingSnowballStemmer('english', ignore_stopwords=True)
strip_punct = str.maketrans('', '', string.punctuation)
stem_text = lambda s: ' '.join([stemmer.stem(word) for word in word_tokenize(s.translate(strip_punct))])
        
df['content_stemmed'] = df['content'].apply(stem_text)

In [46]:
df['content_stemmed'].head()

reviewid
1     aberfeldi record their debut young forev use a...
6     can there be any purpos behind a master degre ...
7     funni what get tag for popular comeback these ...
8     aarktica didnt seem to know what he was on 200...
10    abc are all about uniti their bandnam come fro...
Name: content_stemmed, dtype: object

In [47]:
vect = TfidfVectorizer(stop_words='english', min_df=10)
words = vect.fit_transform(df['content_stemmed'])

In [48]:
selector = SelectKBest(k=10000)
words_best = selector.fit_transform(words, df['score'])

In [182]:
target_columns = ['score', 'best_new_music'] + list(genre_dummies.columns)
targets = df[target_columns]
words_train, words_test, targets_train, targets_test = train_test_split(words_best, targets)

In [187]:
pca = PCA(n_components=20)
svc = LinearSVC()
clf = Pipeline([('pca',pca), ('svc',svc)])
clf.fit(words_train.toarray(), targets_train['genre_rock'])

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=20,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)

In [188]:
pred_train = clf.predict(words_train.toarray())
pred_test = clf.predict(words_test.toarray())
print('Training Accuracy: {}'.format(accuracy_score(targets_train['genre_rock'], pred_train)))
print('Test Accuracy: {}'.format(accuracy_score(targets_test['genre_rock'], pred_test)))
print('')
print('Training Precision: {}'.format(precision_score(targets_train['genre_rock'], pred_train)))
print('Test Precision: {}'.format(precision_score(targets_test['genre_rock'], pred_test)))

Training Accuracy: 0.7631063737219926
Test Accuracy: 0.7627229230100043

Training Precision: 0.7413054435483871
Test Precision: 0.7339587242026266


In [117]:
reg = LassoLarsCV(cv=5, n_jobs=8, eps=1e-7)
reg.fit(words_train.toarray(), targets_train['score'])

LassoLarsCV(copy_X=True, cv=5, eps=1e-07, fit_intercept=True, max_iter=500,
            max_n_alphas=1000, n_jobs=8, normalize=True, positive=False,
            precompute='auto', verbose=False)

In [118]:
pred_train = reg.predict(words_train)
pred_test = reg.predict(words_test)
print("Train Score: {}".format(r2_score(targets_train['score'], pred_train)))
print("Test Score: {}".format(r2_score(targets_test['score'], pred_test)))

Train Score: 0.35177502091337254
Test Score: 0.29364468256328957


In [119]:
coef = reg.coef_
temp = selector.inverse_transform(coef.reshape(1,-1))
terms = vect.inverse_transform(temp)

In [120]:
term_df = pd.DataFrame({
    'term': terms[0],
    'coef': coef[np.abs(coef)>0]
}).sort_values('coef')
print("Number of regressors: {}".format(term_df.shape[0]))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(term_df)

Number of regressors: 365
             term       coef
6      abercrombi -12.461979
166       insipid -12.248254
223        mötley  -9.813977
331      unfortun  -9.540172
339      unlisten  -9.109199
42          bland  -8.887414
360          wors  -7.650982
8           abysm  -7.192243
237      overseri  -7.186419
241       passabl  -6.827892
345         vapid  -6.674970
361         worst  -6.365376
83         decent  -6.228456
332      uninspir  -5.991359
262       problem  -5.940041
140     halfheart  -5.914727
25      audioslav  -5.825763
210       misguid  -5.730452
364      zaireeka  -5.682503
205       mediocr  -5.466679
27          avril  -5.220546
128       generic  -5.102485
123       forgett  -5.076517
252      platitud  -5.012756
24        attempt  -4.817741
48          blooz  -4.639327
105     embarrass  -4.601234
182          lack  -4.577727
326        turgid  -4.569312
199            m2  -4.322959
0              00  -4.287736
314         tepid  -4.257082
31          banal

In [55]:
def find_term(df, t):
    select = df['content_stemmed'].str.contains(t)
    return df[select]

In [137]:
test_df = find_term(df, 'best')

In [140]:
print(test_df)

                                                  title           artist  \
reviewid                                                                   
1                                         young forever        aberfeldy   
8                                        bleeding light         aarktica   
10                                                 abcs             abcs   
11                        homesick and happy to be here         aberdeen   
12                                              abilene          abilene   
...                                                 ...              ...   
22721                                      prelapsarian         krallice   
22722     insecure (music from the hbo original series)  various artists   
22724                      filthy america its beautiful          the lox   
22725                                         new start             taso   
22745                                  run the jewels 3   run the jewels   

          s

In [96]:
stemmer.unstem('chomski')

['Chomsky', 'Chomskys']

In [113]:
list(genre_dummies.columns)

['genre_electronic',
 'genre_experimental',
 'genre_folk/country',
 'genre_global',
 'genre_jazz',
 'genre_metal',
 'genre_other',
 'genre_pop/r&b',
 'genre_rap',
 'genre_rock']

In [189]:
vect.inverse_transform(selector.inverse_transform(np.abs(clf['pca'].components_) > 0.05))

[array(['album', 'artist', 'band', 'beat', 'chorus', 'collabor', 'danc',
        'dj', 'electron', 'group', 'guitar', 'hes', 'hiphop', 'hous',
        'indi', 'label', 'live', 'lyric', 'metal', 'music', 'play', 'pop',
        'punk', 'rap', 'rapper', 'record', 'remix', 'rock', 'sampl',
        'sing', 'song', 'songwrit', 'synth', 'techno', 'theyr', 'theyv',
        'track', 'vocal'], dtype='<U22'),
 array(['ambient', 'bad', 'bass', 'boy', 'build', 'compos', 'composit',
        'creat', 'dont', 'drone', 'drum', 'electron', 'element', 'fuck',
        'girl', 'good', 'got', 'guitar', 'guy', 'hes', 'hiphop', 'hous',
        'im', 'improvis', 'instrument', 'jazz', 'just', 'kid', 'know',
        'layer', 'life', 'loop', 'love', 'lyric', 'man', 'melodi', 'metal',
        'minut', 'mix', 'music', 'musician', 'na', 'nois', 'organ',
        'percuss', 'piano', 'piec', 'rap', 'rapper', 'record', 'rhyme',
        'rhythm', 'rock', 'say', 'shit', 'song', 'sound', 'space', 'synth',
        'synthes'

In [174]:
print(df[df['best_new_music'] == 1].iloc[2].content)

At the break in "Winters Love", from Animal Collective's feral, sentimental Sung Tongs, when the
    guitar picks up and Avey Tare and Panda Bear's voices flail like children who know they'll never die,
    it occurs to me that youth isn't always wasted on the young.  This is an old cliché, about how kids can
    never really appreciate the finer aspects of immaturity-- in fact, it's their immaturity and naiveté that
    all the gray adults crave like zombies.  It's a child's lack of self-conscience and "common sense" that
    make them holy, just as it's an adult's knowledge of their own mortality that makes them a little bit dead.
    However, even if the sickly, aching adults might long for days spent wasting precious time and forgetting
    precious lessons, most of them wouldn't turn back the clock if given the chance.  Youth isn't wasted on
    the young at all, because only kids on holiday could afford to leave their hearts exposed for so long,
    to sing as loudly and to take 