In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

from sklearn.feature_extraction.text import CountVectorizer
import spacy
from nltk.corpus import stopwords

In [6]:
df = pd.read_csv('spotify_netflix.csv')

In [7]:
df = df[df['text_length'] > 10]

## Modeling

In [8]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

###########################################################################################################################

### Using Countvectorizer

In [9]:
#Baseline 
df.is_spotify.value_counts(normalize=True)

1    0.500958
0    0.499042
Name: is_spotify, dtype: float64

In [10]:
X = df['text']
y = df['is_spotify']

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = 'english')

In [12]:
X = cv.fit_transform(X)

In [13]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

In [15]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [16]:
nb.fit(X_train,y_train)

MultinomialNB()

In [17]:
predictions = nb.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix, classification_report

In [19]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[2760  132]
 [ 140 2764]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2892
           1       0.95      0.95      0.95      2904

    accuracy                           0.95      5796
   macro avg       0.95      0.95      0.95      5796
weighted avg       0.95      0.95      0.95      5796



### Using TF-IDF and Bayes

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [21]:
pipe1 = Pipeline([('bow', CountVectorizer(stop_words= 'english')),
                ('tfidf',TfidfTransformer()),
                ('model',MultinomialNB())])

In [22]:
X = df['text']
y = df['is_spotify']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

In [23]:
pipe1.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()), ('model', MultinomialNB())])

In [25]:
pipe1.score(X_train,y_train)

0.975079494195075

In [26]:
pipe1.score(X_test,y_test)

0.9508281573498965

In [27]:
predictions = pipe1.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[2778  114]
 [ 171 2733]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2892
           1       0.96      0.94      0.95      2904

    accuracy                           0.95      5796
   macro avg       0.95      0.95      0.95      5796
weighted avg       0.95      0.95      0.95      5796



### KNN

In [28]:
pipe2 = Pipeline([('bow', CountVectorizer(stop_words= 'english')),
                ('tfidf',TfidfTransformer()),
                ('model',KNeighborsClassifier())])

In [29]:
X = df['text']
y = df['is_spotify']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

In [30]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('model', KNeighborsClassifier())])

In [31]:
pipe2.score(X_train,y_train)

0.7537528654884271

In [32]:
pipe2.score(X_test,y_test)

0.6752933057280883

In [33]:
predictions = pipe2.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[2716  176]
 [1706 1198]]
              precision    recall  f1-score   support

           0       0.61      0.94      0.74      2892
           1       0.87      0.41      0.56      2904

    accuracy                           0.68      5796
   macro avg       0.74      0.68      0.65      5796
weighted avg       0.74      0.68      0.65      5796



### RandomForestClassifier

In [34]:
pipe3 = Pipeline([('bow', CountVectorizer(stop_words= 'english')),
                ('tfidf',TfidfTransformer()),
                ('model',RandomForestClassifier())])

In [35]:
X = df['text']
y = df['is_spotify']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

In [36]:
pipe3.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('model', RandomForestClassifier())])

In [37]:
pipe3.score(X_train,y_train)

0.9990386748502551

In [38]:
pipe3.score(X_test,y_test)

0.9501380262249828

In [39]:
predictions = pipe3.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[2812   80]
 [ 209 2695]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2892
           1       0.97      0.93      0.95      2904

    accuracy                           0.95      5796
   macro avg       0.95      0.95      0.95      5796
weighted avg       0.95      0.95      0.95      5796



########################################################################################################################