In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import tree
from sklearn.model_selection import (train_test_split, 
                                    cross_val_score)
from sklearn.metrics import precision_score

from IPython.display import (SVG, display, HTML)
from graphviz import Source

In [2]:
dating_data = {
    'predicted_likes': [15, 15],
    'predicted_dislikes': [30, 40],
}
dating_index = ['real_likes', 'real_dislikes']
dating_df = pd.DataFrame(dating_data, index=dating_index)
dating_df.head()

Unnamed: 0,predicted_likes,predicted_dislikes
real_likes,15,30
real_dislikes,15,40


In [3]:
precision = dating_df.predicted_likes.real_likes / (dating_df.predicted_likes.real_likes + dating_df.predicted_likes.real_dislikes)
precision

0.5

In [4]:
recall = dating_df.predicted_likes.real_likes / (dating_df.predicted_likes.real_likes + dating_df.predicted_dislikes.real_likes)
recall

0.3333333333333333

In [5]:
F1 = 2 * dating_df.predicted_likes.real_likes / (2 * dating_df.predicted_likes.real_likes 
                                                 + dating_df.predicted_likes.real_dislikes 
                                                 + dating_df.predicted_dislikes.real_dislikes)
F1

0.35294117647058826

# Самостоятельная работа на песнях

In [6]:
df = pd.read_csv('songs.csv')
df.head()

Unnamed: 0,song,year,artist,genre,lyrics,length,commas,exclamations,colons,hyphens
0,the-ownerz,2007,gangstarr,Hip-Hop,"""One-two..""\n""One-two, shots to the chin.. kno...",1300,24,5,0,11
1,dead-sound-of-misery,2006,blind-guardian,Metal,It's dark\nLet's do it now\nBring in the siren...,1566,0,0,0,0
2,modred-s-song,2006,blind-guardian,Metal,I've lost my battle before it starts\nmy first...,996,1,0,1,0
3,the-ninth-wave,2015,blind-guardian,Metal,As dark night embraces\nWe are the nation\nI w...,2868,5,0,0,0
4,skills,2014,gangstarr,Hip-Hop,"Skills, skills, skills\n""My Microphone""\n""It's...",3351,22,0,0,3


In [7]:
X = df.drop(['song', 'artist', 'genre', 'lyrics'], axis=1)
X.head()

Unnamed: 0,year,length,commas,exclamations,colons,hyphens
0,2007,1300,24,5,0,11
1,2006,1566,0,0,0,0
2,2006,996,1,0,1,0
3,2015,2868,5,0,0,0
4,2014,3351,22,0,0,3


In [8]:
y = df.artist
y.head()

0         gangstarr
1    blind-guardian
2    blind-guardian
3    blind-guardian
4         gangstarr
Name: artist, dtype: object

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
# Подбор подходящей глубины
best_depth = None
precision = 0.0
for depth in range(1, 25):
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=depth)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    _precision = precision_score(y_test, predictions, average='micro')
    if _precision > precision:
        precision = _precision
        best_depth = depth
precision, best_depth

(0.8403361344537815, 6)

In [11]:
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=best_depth)
clf.fit(X_train, y_train)

In [12]:
# graph = Source(tree.export_graphviz(clf, feature_names=list(X), class_names=list(set(y)), filled=True))
# display(SVG(graph.pipe(format='svg')))

In [13]:
predictions = clf.predict(X_test)

In [14]:
precision = precision_score(y_test, predictions, average='micro')
precision

0.8403361344537815