In [192]:
import json
import os
from time import time
# import re
import pandas as pd
import numpy as np
# from nltk import FreqDist
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Reshape
from keras.optimizers import SGD 

from collections import Counter
from collections import OrderedDict


## Preprocessing

In [193]:
# takes path, returns list with of albums in json format
# each json entry is a song. 
def json_extract(path):
    data_list=[]
    for file in os.listdir(path): 
        if file[-5:] == '.json':
            with open(path+file, 'r') as f: 
                data = json.load(f)
                data_list.append(data)
    return data_list

In [194]:
path = 'data/drake/'
drake=json_extract(path)
path='data/quentin_miller/'
quentin=json_extract(path)

In [195]:
d_count=0
q_count=0
iyrtitl=[]
iyrtitl_set=[]
for album in drake: 
    for song in album: 
        if song["album"]=="If You’re Reading This It’s Too Late ":
            iyrtitl.append(song['title'])
            iyrtitl_set.append(song['lyrics'])
        else: 
            d_count +=1

for album in quentin:
    for song in album:
        q_count +=1
    

In [196]:
print(d_count, q_count, d_count+q_count)

242 138 380


In [197]:
# store data about each document in a seperate dictionary with keys as titles and and value containing title, album, and artist info. 
# helpful for checking dataframe of corpus and test set against album info. 
class Doc_info: 
    def __init__(self, title: str, album : str, artist : str):
        self.title = title
        self.album = album
        self.artist = artist



In [198]:
np.random.seed(1)
corpus = []
iyrtitl= []
test_set = []
iyrtitl_set=[]
corpus_doc_ls = []
test_doc_ls = []
d_test_cnt = 0
d_corp_cnt=0

for album in drake: 
    for song in album: 
        # keep track of "If you're..." to put in a separate set (iytitl is our ambiguous authorship)
        if song["album"]=="If You’re Reading This It’s Too Late ":
            iyrtitl.append(song['title'])
            iyrtitl_set.append(song['lyrics'])
        else: 
            # place more drake songs in test set to better balance training sample
            if np.random.rand(1) < 0.15: 
                test_set.append(song['lyrics'])
                test_doc_ls.append(Doc_info(song['title'], song['album'], 'drake'))
                d_test_cnt+=1
            else: 
                corpus.append(song['lyrics'])
                corpus_doc_ls.append(Doc_info(song['title'], song['album'], 'drake'))
                d_corp_cnt+=1
                
for album in quentin: 
    for song in album: 
        if np.random.rand(1) < 0.1: 
            test_set.append(song['lyrics'])
            test_doc_ls.append(Doc_info(song['title'], song['album'], 'quentin'))
        else: 
            corpus.append(song['lyrics'])
            corpus_doc_ls.append(Doc_info(song['title'], song['album'], 'quentin'))

     

In [199]:
y_train = np.zeros(len(corpus))
y_train[d_corp_cnt:] =1
y_test = np.zeros(len(test_set))
y_test[d_test_cnt:] =1

In [None]:
print("quentin songs found:", len(quentin))
print("drake songs found:", len(drake))

In [200]:
#stops used to remove obvious predictive words.
stops=['drizzy', 'drake', 'quentin', 'miller', 'ovo', 'champagne', 'papi','toronto', 'atlanta', '6']
#used to get rid of common non-contexual words to focus which contextual words are most influential. 
stops2=stopwords.words("English")

In [201]:
# keep track of accuracy score of all 6 models 
score_dict = {}
# track words that are best "drake" predictors, "miller" predictors, and general predictors
drake_tokens=[]
quentin_tokens=[]
total_tokens=[]
# track the test set predictions for held out if you're reading this its too late 
df=pd.DataFrame(np.zeros(len(iyrtitl)), index=iyrtitl, columns=["credits"])
credits = ['10 Bands', "Legend", "Know Yourself", "Used To"]
for name in credits:
    df.loc[name]=1
df

Unnamed: 0,credits
Legend,1.0
Energy,0.0
10 Bands,1.0
Know Yourself,1.0
No Tellin’,0.0
Madonna,0.0
6 God,0.0
Star67,0.0
Preach,0.0
Used To,1.0


In [202]:
corpus[1]

'I don\'t have a fuck to give, I\'ve been moving state to state\nIn my leather and my Timbs like it\'s 1998\nAnd my dog Chubby Chub, that\'s my nigga from the way\nOn the Eastside of the city, that\'s where everybody stay\nSeem like everybody calling \'cause they want me on their song\nIt\'s like every time I touch it I could never do no wrong\nWhen they need a favor from you, man, they don\'t leave you alone\nBut I guess that\'s just the motion, yeah\nYeah, looking back on it, at least my pride is intact\n\'Cause we said "no strings attached" and I still got tied up in that\nEverything that I write is either for her or about her\nSo I\'m with her even when I\'m here without her and she know it\nThe girl that I wanna save is like a danger to my health\nTry being with somebody that wanna be somebody else\nI always thought she was perfect when she was being herself\nDon\'t even know how to help, but I guess that\'s just the motion, yeah\n'

In [203]:
# tf= TfidfTransformer(use_idf=True, norm='l1')
# c=["one four two three four four four", "four five five seven", "one two three four five six seven eight nine"]
# v1 = cv.fit_transform(c)
# d=pd.DataFrame(v1.toarray(), columns=cv.get_feature_names(), index = ['count0', 'count1', 'count2'])
# v2=tf.fit_transform(v1.toarray())
# d=d.append(pd.DataFrame(v2.toarray(), columns=cv.get_feature_names(),  index = ['tf0', 'tf1', 'tf2']))
# d
# # np.max(v1.toarray() - v2.toarray())
# # np.max(v1.toarray())

In [204]:
analyze = CV().build_analyzer()
ps = PorterStemmer()

In [205]:
def stemming(doc): 
    return (ps.stem(w) for w in analyze(doc))

In [206]:
# takes a pipeline, hyperparameters, and a number of folds. 
# prints information about the grid search and returns the GridSearchCV object with the best model
def grid_search(pipeline, param, k):
    grid = GridSearchCV(pipeline, param, cv=k, n_jobs=4, verbose =1)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:", param)
    start = time()
    grid.fit(corpus, y_train)
    print("done in %0.3fs" % (time() - start))
    print("Best score: %0.3f" % grid.best_score_)
    print("Best parameters set:")
    for param_name in sorted(param.keys()):
        print("\t%s: %r" % (param_name, grid.best_params_[param_name]))
    return grid

## Logistic Regression

In [207]:
pipeline = Pipeline(steps=[
    ('vect', CV(analyzer=stemming, stop_words=stops, max_df=0.4)),
    ('tfidf', TfidfTransformer(use_idf=True, norm='l1')),
    ('clf', LogisticRegression(class_weight='balanced', penalty='elasticnet', solver='saga'))
])

param= {
#     'vect__max_df': (.4, .5), 
#     'vect__min_df': (3, 5, 7), 
    'vect__ngram_range': ((1, 1), (1, 2)),
#     'vect__stop_words': (stops, stops2),/
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
    'clf__l1_ratio': (0.1, 0,5, 0.9, 0.95),
    'clf__C' : (0.01, 0.1, 1., 10),
}

In [208]:
grid=grid_search(pipeline, param, 3)
best_estimator=grid.best_estimator_

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__ngram_range': ((1, 1), (1, 2)), 'clf__l1_ratio': (0.1, 0, 5, 0.9, 0.95), 'clf__C': (0.01, 0.1, 1.0, 10)}
Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   48.7s
[Parallel(n_jobs=4)]: Done 120 out of 120 | elapsed:  2.1min finished


done in 128.407s
Best score: 0.842
Best parameters set:
	clf__C: 10
	clf__l1_ratio: 0
	vect__ngram_range: (1, 2)


In [211]:
best_estimator.score(best_estimator.predict(test_set), y_test)

AttributeError: 'numpy.float64' object has no attribute 'lower'

In [13]:
score_dict['log']=grid.best_score_

In [14]:
test=best_estimator['vect'].transform(iyrtitl_set)
test=best_estimator['tfidf'].transform(test)
df['log_drake']=best_estimator['clf'].predict_proba(test)[:,0]
df['log_quen']=best_estimator['clf'].predict_proba(test)[:,1]

In [15]:
df

Unnamed: 0,credits,log_drake,log_quen
Legend,1.0,0.577247,0.422753
Energy,0.0,0.504775,0.495225
10 Bands,1.0,0.50085,0.49915
Know Yourself,1.0,0.517119,0.482881
No Tellin’,0.0,0.503975,0.496025
Madonna,0.0,0.528818,0.471182
6 God,0.0,0.540827,0.459173
Star67,0.0,0.527664,0.472336
Preach,0.0,0.544037,0.455963
Used To,1.0,0.528538,0.471462


In [16]:
cdf = pd.DataFrame(best_estimator['clf'].coef_.T, 
                   best_estimator['vect'].get_feature_names(), 
                   columns=['Coefficients']).sort_values(['Coefficients'])
drake_tokens.append(cdf.index.tolist()[:15])
total_tokens.append(cdf.index.tolist()[:15])
total_tokens.append(cdf.index.tolist()[-15:])
quentin_tokens.append(cdf.index.tolist()[-15:])

## Stochastic Gradient Decent

In [17]:
pipeline = Pipeline(steps=[
    ('vect', CV(analyzer=stemming, stop_words=stops, max_df=0.4)),
    # every iteration has yielded tfidf-l2 as the best for prediction. 
    ('tfidf', TfidfTransformer(use_idf=True, norm='l1')),
    ('clf', SGDClassifier(class_weight='balanced', loss= 'modified_huber'))
])

params = {    
#     'vect__min_df': (3, 5, 7), 
    'vect__ngram_range': ((1, 1), (1, 2)),
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
#     'clf__loss' : ('perceptron'),
    "clf__alpha" : (0.0001, 0.001, 0.01),
#     'clf__l1_ratio' : (0, 0.5, 0.9, 1),
#     'clf__tol' : (0.0001, 0.001),
}


In [18]:
grid=grid_search(pipeline, params, 3)


Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__ngram_range': ((1, 1), (1, 2)), 'clf__alpha': (0.0001, 0.001, 0.01)}
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed:   15.0s finished


done in 16.937s
Best score: 0.853
Best parameters set:
	clf__alpha: 0.001
	vect__ngram_range: (1, 2)


In [19]:
best_estimator= grid.best_estimator_
score_dict['sgd']=grid.best_score_

In [20]:
test=best_estimator['vect'].transform(iyrtitl_set)
test=best_estimator['tfidf'].transform(test)
df['sgd_drake']=best_estimator['clf'].predict_proba(test)[:,0]
df['sgd_quen']=best_estimator['clf'].predict_proba(test)[:,1]
# best_evaluator['clf'].predict(test)

In [21]:
cdf = pd.DataFrame(best_estimator['clf'].coef_.T, 
                   best_estimator['vect'].get_feature_names(), 
                   columns=['Coefficients']).sort_values(['Coefficients'])
drake_tokens.append(cdf.index.tolist()[:15])
total_tokens.append(cdf.index.tolist()[:15])
total_tokens.append(cdf.index.tolist()[-15:])
quentin_tokens.append(cdf.index.tolist()[-15:])

## Support Vector Classification

In [22]:
pipeline = Pipeline(steps=[
    ('vect', CV(analyzer=stemming, stop_words=stops, max_df=0.4)),
    # every iteration has yielded tfidf-l2 as the best for prediction. 
    ('tfidf', TfidfTransformer(use_idf=True, norm='l1')),
    ('clf', SVC(kernel='linear', class_weight='balanced', probability=True))
])

params = {    
#     'vect__min_df': (3, 5, 7), 
    'vect__ngram_range': ((1, 1), (1, 2)),
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
#     'clf__kernel' : ('linear', 'poly', 'rbf'),
    'clf__C' : (5, 10, 20),
}

In [23]:
grid=grid_search(pipeline, params, 3)


Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__ngram_range': ((1, 1), (1, 2)), 'clf__C': (5, 10, 20)}
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed:   13.8s finished


done in 16.156s
Best score: 0.843
Best parameters set:
	clf__C: 10
	vect__ngram_range: (1, 1)


In [24]:
score_dict['svc']=grid.best_score_
best_estimator= grid.best_estimator_

In [25]:
test=best_estimator['vect'].transform(iyrtitl_set)
test=best_estimator['tfidf'].transform(test)
df['svc_drake']=best_estimator['clf'].predict_proba(test)[:,0]
df['svc_quen']=best_estimator['clf'].predict_proba(test)[:,1]


In [26]:
cdf = pd.DataFrame(best_estimator['clf'].coef_.T, 
                   best_estimator['vect'].get_feature_names(), 
                   columns=['Coefficients']).sort_values(['Coefficients'])
drake_tokens.append(cdf.index.tolist()[:15])
total_tokens.append(cdf.index.tolist()[:15])
total_tokens.append(cdf.index.tolist()[-15:])
quentin_tokens.append(cdf.index.tolist()[-15:])

## Random forest 

In [27]:
pipeline = Pipeline(steps=[
    ('vect', CV(analyzer=stemming, stop_words=stops, max_df=0.4)),
    ('tfidf', TfidfTransformer(use_idf=True, norm='l1')),
    ('clf', RandomForestClassifier(class_weight='balanced')),
])

params= {
#     'vect__max_df': (.4, .5), 
    'vect__min_df': (3, 5, 7), 
    'vect__ngram_range': ((1, 1), (1, 2)),
# #     'vect__stop_words': (stops, stops2),
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
#     'clf__n_estimators' : (50, 100, 125) ,
    'clf__max_depth' : (30, 50, 100),
#     'clf__max_leaf_nodes' : (5, 25, 50),
}

In [28]:
grid=grid_search(pipeline, params, 3)


Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__min_df': (3, 5, 7), 'vect__ngram_range': ((1, 1), (1, 2)), 'clf__max_depth': (30, 50, 100)}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   33.4s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   40.3s finished


done in 42.064s
Best score: 0.817
Best parameters set:
	clf__max_depth: 30
	vect__min_df: 5
	vect__ngram_range: (1, 1)


In [29]:
score_dict['rdf']=grid.best_score_
best_estimator = grid.best_estimator_

In [30]:
test=best_estimator['vect'].transform(iyrtitl_set)
test=best_estimator['tfidf'].transform(test)
df['rdf_drake']=best_estimator['clf'].predict_proba(test)[:,0]
df['rdf_quen']=best_estimator['clf'].predict_proba(test)[:,1]

In [31]:
cdf = pd.DataFrame(best_estimator['clf'].feature_importances_.T, 
                   best_estimator['vect'].get_feature_names(), 
                   columns=['Coefficients']).sort_values(['Coefficients'])
total_tokens.append(cdf[-30:].index.tolist())


In [32]:
# cdf['Coefficients'][:30].index

In [33]:
# 15 most important features for splitting
total_tokens.append(cdf[-30:].index.tolist())

## AdaBoost

In [34]:
pipeline = Pipeline(steps=[
    ('vect', CV(analyzer=stemming, stop_words=stops, max_df=0.4)),
    ('tfidf', TfidfTransformer(use_idf=True, norm='l1')),
    ('clf', AdaBoostClassifier())
])

params = {    
#     'vect__min_df': (3, 5, 7), 
    'vect__ngram_range': ((1, 1), (1, 2)),
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
    'clf__n_estimators' : (80, 100, 120),
    'clf__learning_rate' : (0.1, 0.3, 0.5),
}

In [35]:
grid=grid_search(pipeline, params, 3)


Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__ngram_range': ((1, 1), (1, 2)), 'clf__n_estimators': (80, 100, 120), 'clf__learning_rate': (0.1, 0.3, 0.5)}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   38.7s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   48.1s finished


done in 50.250s
Best score: 0.843
Best parameters set:
	clf__learning_rate: 0.1
	clf__n_estimators: 120
	vect__ngram_range: (1, 1)


In [36]:
score_dict['ada']=grid.best_score_
best_estimator= grid.best_estimator_

In [37]:
test=best_estimator['vect'].transform(iyrtitl_set)
test=best_estimator['tfidf'].transform(test)
df['ada_drake']=best_estimator['clf'].predict_proba(test)[:,0]
df['ada_quen']=best_estimator['clf'].predict_proba(test)[:,1]
# best_evaluator['clf'].predict(test)

In [38]:
cdf = pd.DataFrame(best_estimator['clf'].feature_importances_.T, 
                   best_estimator['vect'].get_feature_names(), 
                   columns=['Coefficients']).sort_values(['Coefficients'])
# 30 most important features for splitting
total_tokens.append(cdf[-30:].index.tolist())

## Gradient Boosting

In [39]:
pipeline = Pipeline(steps=[
    ('vect', CV(analyzer=stemming, stop_words=stops, max_df=0.4)),
    # every iteration has yielded tfidf-l2 as the best for prediction. 
    ('tfidf', TfidfTransformer(use_idf=True, norm='l1')),
    ('clf', GradientBoostingClassifier())
])

params = {    
#     'vect__min_df': (3, 5, 7), 
    'vect__ngram_range': ((1, 1), (1, 2)),
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
    'clf__learning_rate' : (0.1,0.2, 0.3),
#     'clf__n_estimators' : (50, 75, 100),
#     'clf__min_samples_leaf' : (1,2,5),
    'clf__max_depth' : (2,3,5),
#     'clf__min_impurity_decrease' : (0, 0.01, 0.05),
#     'clf__tol' = (0.0001, 0.001, 0.005),
}

In [40]:
grid=grid_search(pipeline, params, 3)

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__ngram_range': ((1, 1), (1, 2)), 'clf__learning_rate': (0.1, 0.2, 0.3), 'clf__max_depth': (2, 3, 5)}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   44.6s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   56.6s finished


done in 58.851s
Best score: 0.840
Best parameters set:
	clf__learning_rate: 0.3
	clf__max_depth: 2
	vect__ngram_range: (1, 2)


In [41]:
score_dict['gdb']=grid.best_score_
best_estimator= grid.best_estimator_
test=best_estimator['vect'].transform(iyrtitl_set)
test=best_estimator['tfidf'].transform(test)
df['gdb_drake']=best_estimator['clf'].predict_proba(test)[:,0]
df['gdb_quen']=best_estimator['clf'].predict_proba(test)[:,1]

In [42]:
cdf = pd.DataFrame(best_estimator['clf'].feature_importances_.T, 
                   best_estimator['vect'].get_feature_names(), 
                   columns=['Coefficients']).sort_values(['Coefficients'])

# 15 most important features for splitting
total_tokens.append(cdf[-30:].index.tolist())

## Nueral Network

In [94]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)

In [None]:
X_train = tokenizer.texts_to_sequences()

In [77]:
cv=CV(analyzer=stemming, stop_words=stops, min_df = 5, max_df=0.4)
    # every iteration has yielded tfidf-l2 as the best for prediction. 
tf=TfidfTransformer(use_idf=True, norm='l1')
r=tf.fit_transform(cv.fit_transform(corpus))

In [78]:
r.shape[1]

1289

In [87]:
print(r.shape[1])
#get proper x_train shape
model = Sequential([
    Dense(10, input_dim=r.shape[1], activation='relu'),
    Dense(20, activation='relu'),
    Dense(30, activation='relu'),
    Dense(20, activation='relu'),
    Dense(units=1, activation='sigmoid')
])
          
model.summary()

1289
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 10)                12900     
_________________________________________________________________
dense_36 (Dense)             (None, 20)                220       
_________________________________________________________________
dense_37 (Dense)             (None, 30)                630       
_________________________________________________________________
dense_38 (Dense)             (None, 20)                620       
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 21        
Total params: 14,391
Trainable params: 14,391
Non-trainable params: 0
_________________________________________________________________


In [88]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['acc'])

In [89]:
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, 
                                              patience=10, verbose=0, mode='auto')

In [93]:
batch_size = 64 
epochs = 500 

num_classes =2

node_laters = [100, 100, 100]
history_basic = model.fit(r, y_train, 
                          callbacks=[earlyStopping], 
                          batch_size=batch_size, 
                          epochs=epochs, 
                          validation_split=.3, 
                          verbose=True)#False


Train on 210 samples, validate on 90 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500


## Results

In [335]:
# baseline accuracy of naive model of predict "Drake" on every song. 
base=d_corp_cnt/len(y_train)
print("base:\t {:.3f}".format(base))
for model in score_dict: 
    print("{}:\t {:.3f}".format(model, score_dict[model]))

base:	 0.750
log:	 0.850
sgd:	 0.847
svc:	 0.843


In [197]:
# psuedo stacking method of prediction on test set. 
drake_cols=['sgd_drake', 'log_drake', 'svc_drake', 'rdf_drake', 'ada_drake', 'gdb_drake']
quen_cols=['sgd_quen', 'log_quen', 'svc_quen', 'rdf_quen', 'ada_quen', 'gdb_quen']

In [238]:
#input dataframe, and first co
# sum the rake probabilities weighted by how much improvement over the the baseline prediction that model offered. 
# standardize the result so the final output is a probability that will sum to one when added with Quentin probability
def stack(df, cols): 
    result=np.zeros(len(iyrtitl))
    for col in cols: 
#         print(df[col]) #, score_dict[col[:3]])
        result += df[col]
#     *score_dict[col[:3]]
#     score_dict[(col[:3])]*
#     accum= 0
#     for el in score_dict: 
#         accum+=score_dict[el]
#     print(accum)
    return(result/len(cols))
#            /accum)

In [239]:
accum= 0
for el in score_dict: 
    accum+=score_dict[el]
accum

5.81

In [244]:
df['stake_drake']=stack(df, drake_cols)

In [245]:
df['stake_quen']=stack(df, quen_cols)

In [336]:
df

Unnamed: 0,credits,log_drake,log_quen,sgd_drake,sgd_quen,svc_drake,svc_quen
Legend,1.0,0.578448,0.421552,0.802223,0.197777,0.952967,0.047033
Energy,0.0,0.510573,0.489427,0.63224,0.36776,0.657906,0.342094
10 Bands,1.0,0.507951,0.492049,0.552988,0.447012,0.617244,0.382756
Know Yourself,1.0,0.524265,0.475735,0.704607,0.295393,0.774711,0.225289
No Tellin’,0.0,0.510937,0.489063,0.626998,0.373002,0.625399,0.374601
Madonna,0.0,0.530454,0.469546,0.634039,0.365961,0.77526,0.22474
6 God,0.0,0.544712,0.455288,0.763759,0.236241,0.896706,0.103294
Star67,0.0,0.536065,0.463935,0.739343,0.260657,0.838215,0.161785
Preach,0.0,0.549942,0.450058,0.74844,0.25156,0.903559,0.096441
Used To,1.0,0.534641,0.465359,0.765621,0.234379,0.829249,0.170751


In [372]:
total_tokens[-1].values()

TypeError: 'numpy.ndarray' object is not callable

In [366]:
import functools
# def app(ls): 
l_drake=functools.reduce(lambda x, y : x+y, drake_tokens, [])
l_quen=functools.reduce(lambda x, y : x+y, quentin_tokens, [])
l_tot=functools.reduce(lambda x, y : x+y, total_tokens, [])

ValueError: operands could not be broadcast together with shapes (90,) (30,) 

In [362]:
Counter(l_drake)

Counter({'oh': 3,
         're': 3,
         'are': 3,
         'night': 3,
         'miss': 2,
         'ever': 3,
         'dog': 2,
         'wishin': 2,
         'whi': 2,
         'littl': 1,
         'onli': 2,
         'firm': 1,
         'wanna': 2,
         'babi': 3,
         'drake': 3,
         've': 1,
         'other': 2,
         'stori': 1,
         'bounc': 1,
         'where': 1,
         'rememb': 1,
         'uh': 1,
         'who': 1,
         'nah': 1})

In [363]:
Counter(l_quen)

Counter({'real': 1,
         'away': 1,
         'forreal': 2,
         'live': 3,
         'wait': 3,
         'bitch': 2,
         'control': 2,
         'might': 2,
         'case': 3,
         'came': 2,
         'gon': 3,
         'woah': 2,
         'whoa': 3,
         'nike': 3,
         'yuh': 3,
         'em': 1,
         'hit': 1,
         'sir': 1,
         'lit': 2,
         'quentin': 1,
         'rap': 1,
         'miller': 1,
         'beat': 1,
         'deep': 1})

In [364]:
Counter(l_tot)

Counter({'real': 1,
         'away': 1,
         'forreal': 2,
         'live': 3,
         'wait': 3,
         'bitch': 2,
         'control': 2,
         'might': 2,
         'case': 3,
         'came': 2,
         'gon': 3,
         'woah': 2,
         'whoa': 3,
         'nike': 3,
         'yuh': 3,
         'em': 1,
         'hit': 1,
         'sir': 1,
         'lit': 2,
         'quentin': 1,
         'rap': 1,
         'miller': 1,
         'beat': 1,
         'deep': 1})

In [365]:
total_tokens

[['oh',
  're',
  'are',
  'night',
  'miss',
  'ever',
  'dog',
  'wishin',
  'whi',
  'littl',
  'onli',
  'firm',
  'wanna',
  'babi',
  'drake'],
 ['real',
  'away',
  'forreal',
  'live',
  'wait',
  'bitch',
  'control',
  'might',
  'case',
  'came',
  'gon',
  'woah',
  'whoa',
  'nike',
  'yuh'],
 ['re',
  'are',
  'oh',
  'drake',
  've',
  'night',
  'ever',
  'babi',
  'other',
  'stori',
  'dog',
  'bounc',
  'where',
  'rememb',
  'uh'],
 ['em',
  'hit',
  'wait',
  'sir',
  'gon',
  'whoa',
  'case',
  'lit',
  'quentin',
  'rap',
  'miller',
  'beat',
  'nike',
  'live',
  'yuh'],
 ['re',
  'oh',
  'ever',
  'wishin',
  'are',
  'night',
  'miss',
  'whi',
  'babi',
  'drake',
  'wanna',
  'other',
  'who',
  'onli',
  'nah'],
 ['deep',
  'lit',
  'forreal',
  'bitch',
  'live',
  'came',
  'wait',
  'control',
  'gon',
  'might',
  'case',
  'woah',
  'whoa',
  'nike',
  'yuh'],
 Index(['ayi', 'oh', 'where', 'let', 'swear', 'who', 'daughter', 'alway',
        'give', '