In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [104]:
import warnings
warnings.filterwarnings("ignore")

In [105]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Data

In [1]:
path = "/Users/jiangruiyin/Desktop/proj/data_cleaned(1).csv"

In [106]:
data = pd.read_csv(path, header=None, names=['lyrics', 'label'])

In [107]:
data

Unnamed: 0,lyrics,label
0,touch me turn me on and burn me down your li...,0
1,well i dont give a damn that your cars on fire...,0
2,i saw you starin at each other i saw your eyes...,0
3,your cruel device your blood like ice one look...,0
4,stretch the bones over my skin stretch the ski...,0
...,...,...
2459,haces muy mal en elevar mi tensión en aplastar...,3
2460,cierro los ojos sin mirar atrás las cosas buen...,3
2461,una palomita blanca de piquito colorado ayer y...,3
2462,sólo una palabra se hubiera llevado el dolor c...,3


In [108]:
labels = data.iloc[:,1]
lyrics = data.iloc[:,0]

In [109]:
labels[2462]

3

In [110]:
lyrics[2462]

'sólo una palabra se hubiera llevado el dolor con el beso amargo de aquel licor hubiera bastado mi amor sólo una mentira se viene conmigo a pasear sentirme querida en aquel abrazo en el mar  con el vestido azul que un día conociste me marcho sin saber si me besaste antes de irte te di mi corazón y tú lo regalaste te di todo el amor que pude darte y me robaste he rasgado mi vestido con una copa de vino hoy tu amor corta como el cristal  en el cielo hay playas donde ves la vida pasar donde los recuerdos no hacen llorar vienen muy despacio y se van sólo una caricia me hubiera ayudado a olvidar que no eran mis labios los que ahora te hacen soñar  con el vestido azul que un día conociste me marcho sin saber si me besaste antes de irte te di mi corazón y tú lo regalaste te di todo el amor que pude darte y me robaste he rasgado mi vestido con una copa de vino hoy tu amor corta como el cristal buena suerte en tu camino yo ya tengo mi destino con mi sangre escribo este final'

In [111]:
lyrics

0       touch me turn me on and burn me down   your li...
1       well i dont give a damn that your cars on fire...
2       i saw you starin at each other i saw your eyes...
3       your cruel device your blood like ice one look...
4       stretch the bones over my skin stretch the ski...
                              ...                        
2459    haces muy mal en elevar mi tensión en aplastar...
2460    cierro los ojos sin mirar atrás las cosas buen...
2461    una palomita blanca de piquito colorado ayer y...
2462    sólo una palabra se hubiera llevado el dolor c...
2463    lover please please come back dont take a trai...
Name: lyrics, Length: 2464, dtype: object

In [112]:
X_train, X_test, y_train, y_test = train_test_split(lyrics, labels, test_size=0.2, random_state=42)

In [113]:
X_train

1375    listen to the girl as she takes on half the wo...
903     have yourself a merry little christmas let you...
1939    the show is over partys just begun im high ton...
1325    one day well walk in the sun  my name is verno...
1995    mercy dont think shes pretty no more starin at...
                              ...                        
1638    ive seen god in the sun ive seen god in the st...
1095    i dont want another heartbreak i dont need ano...
1130    id gladly walk across the desert with no shoes...
1294    our god is an awesome god he reigns from heave...
860     i dont want you anymore cause you took my joy ...
Name: lyrics, Length: 1971, dtype: object

## TfidfVectorizer

In [126]:
from nltk.corpus import stopwords
# stopwords
stop_words_EN = stopwords.words('english')
stop_words_SP = stopwords.words('spanish')
stop_words = stop_words_EN + stop_words_SP
# TF-IDF
tf_transfomer = TfidfVectorizer(stop_words=stop_words, decode_error='ignore')
tf_transfomer.fit(lyrics)

TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [128]:
X_train_counts_tf = tf_transfomer.transform(X_train)
X_test_counts_tf = tf_transfomer.transform(X_test)

In [129]:
X_train_counts_tf

<1971x14817 sparse matrix of type '<class 'numpy.float64'>'
	with 90867 stored elements in Compressed Sparse Row format>

In [130]:
X_test_counts_tf

<493x14817 sparse matrix of type '<class 'numpy.float64'>'
	with 22506 stored elements in Compressed Sparse Row format>

## Paras Tuning of base models

In [131]:
num_folds = 10
seed = 42
scoring = 'accuracy'

In [132]:
#Naive Bayes
param_grid = {}
param_grid['alpha'] = [0.001,0.01,0.05,0.1,0.5,1,5]
model = MultinomialNB()
kfold = KFold(n_splits=num_folds, random_state=seed)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = X_train_counts_tf, y = y_train)
print('Best accuracy: %s ; alpha: %s' % (grid_result.best_score_, grid_result.best_params_))

Best accuracy: 0.7843562528841717 ; alpha: {'alpha': 0.05}


In [133]:
#Logistic Regression
param_grid = {}
param_grid['C'] = [0.1,5,13,15]
model = LogisticRegression()
kfold = KFold(n_splits = num_folds, random_state = seed)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = X_train_counts_tf, y = y_train)
print('Best accuracy: %s ; C: %s' % (grid_result.best_score_, grid_result.best_params_))

Best accuracy: 0.9254037840332255 ; C: {'C': 15}


In [134]:
# SVC
param_grid = {}
param_grid['C'] = [1,10,20]
param_grid['kernel'] = ['rbf','linear']
model = svm.SVC(gamma = 'auto')
kfold = KFold(n_splits = num_folds,random_state = seed)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = X_train_counts_tf, y = y_train)
print('Best accuracy: %s ; C: %s' % (grid_result.best_score_, grid_result.best_params_))

Best accuracy: 0.9345357124544942 ; C: {'C': 10, 'kernel': 'linear'}


## Ensemble Models

In [135]:
#AdaBoost
clf=AdaBoostClassifier(base_estimator = LogisticRegression(C=15), algorithm='SAMME', n_estimators=300, learning_rate=0.8)
clf.fit(X_train_counts_tf, y_train).score(X_test_counts_tf, y_test)

0.8985801217038539

In [136]:
base_learners = [
                 ('clf_1', LogisticRegression(C = 15)),
                 ('clf_2', svm.SVC(gamma = 'auto',C = 10, kernel = 'linear' )) ]

In [138]:
#Stack
stack = StackingClassifier(estimators = base_learners, final_estimator = LogisticRegression())
stack.fit(X_train_counts_tf, y_train).score(X_test_counts_tf, y_test)

0.9249492900608519

In [139]:
#Stack
stack_ = StackingClassifier(estimators = base_learners, final_estimator = svm.SVC())
stack_.fit(X_train_counts_tf, y_train).score(X_test_counts_tf, y_test)

0.9310344827586207