In [456]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier  # sklearn.tree module contains DecisionTreeClassifier class
from sklearn.model_selection import train_test_split  # train_test_split is a function in the sklearn.model_selection module
from sklearn.metrics import accuracy_score  # accuracy_score() returns the accuracy score after comparing 2 lists/arrays


# "sklearn" is the PACKAGE which comes with scikit-learn library.. scikit library is the most popular ML library
# in the "sklearn" package.. we have a MODULE "tree".. and in the module "tree".. we have a CLASS DecisionTreeClassifier
# DecisionTreeClassifier implements the Decision Tree algorithm


# Load the music data-set into a DataFrame.
# The path is a RAW string (r'...') so the backslashes of the Windows path are taken literally..
# in a normal string '\D' and '\m' are invalid escape sequences and Python warns about them.
# NOTE(review): this is an absolute local path.. adjust it to wherever music.csv lives on your machine.
df = pd.read_csv(r'D:\Downloads Chrome\music.csv')
df

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [457]:
# Show the 'genre' column.. bracket access works for any column label
print(df['genre'])

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object


In [458]:
# List the column labels of the DataFrame
print(df.columns)

Index(['age', 'gender', 'genre'], dtype='object')


In [459]:
# INPUT set: every column except the 'genre' column (by convention the INPUT set is named 'X')
X = df.drop(columns='genre')
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [460]:
# By convention, 'y' represents our OUTPUT set.. here it is the 'genre' column of the DataFrame
y = df['genre']
print(y)

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object


In [461]:
# The row labels of y
y.index

RangeIndex(start=0, stop=18, step=1)

In [462]:
y.describe

<bound method NDFrame.describe of 0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object>

In [463]:
# The genre labels of y as an array (without the row labels)
y.values

array(['HipHop', 'HipHop', 'HipHop', 'Jazz', 'Jazz', 'Jazz', 'Classical',
       'Classical', 'Classical', 'Dance', 'Dance', 'Dance', 'Acoustic',
       'Acoustic', 'Acoustic', 'Classical', 'Classical', 'Classical'],
      dtype=object)

In [464]:
# Now that we have our INPUT SET & OUTPUT SET
# We need to create our MODEL using a ML ALGORITHM

# Here, we will use DECISION TREE algorithm.. using SciKit library

In [465]:
# Create a new INSTANCE of the DecisionTreeClassifier
# model = DecisionTreeClassifier() 

# Now train the model to learn patterns
# model.fit(X, y)  # fit() recognizes PATTERN in the INPUT set X & the OUTPUT set Y



In [466]:
# Now we need to make our model predict the output after providing a sample input
# The sample input would be hitherto unknown i.e. not seen by the model so far
# predict() returns an OUTPUT based on the training sets already available to the model

# model.predict([[21,0], [23,1]])

In [467]:
# We need to constantly MEASURE/CALCULATE the accuracy of our model
# If our model is not accurate enough, we need to fine-tune our model or change our algorithm

# We need to DIVIDE our available data-sets into two parts.. one for training & another for testing
# 70-80% of the available data-set should be used for training.. rest for testing
# So after building the model, we can use the testing data-set to do sample predictions of the output
# Then we check the accuracy of the predictions against available testing data-set

In [468]:
# train_test_split() takes our INPUT set X & OUTPUT set y and divides each of them into 2 parts:
#   1st part for TRAINING.. 2nd part for TESTING (test_size=0.2 -> 20% of the rows go to TESTING)
# It returns a LIST [X_train, X_test, y_train, y_test] which is unpacked into the 4 names below
# random_state fixes the shuffling.. so every re-run of the notebook produces the SAME split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [469]:
# Show the TRAINING part of the INPUT set
print(X_train)

    age  gender
4    29       1
3    26       1
14   30       0
12   26       0
15   31       0
5    30       1
9    20       0
11   25       0
7    33       1
6    31       1
1    23       1
17   35       0
16   34       0
2    25       1


In [470]:
# Show the TESTING part of the INPUT set
print(X_test)

    age  gender
10   21       0
13   27       0
0    20       1
8    37       1


In [471]:
# Show the TRAINING part of the OUTPUT set
print(y_train)

4          Jazz
3          Jazz
14     Acoustic
12     Acoustic
15    Classical
5          Jazz
9         Dance
11        Dance
7     Classical
6     Classical
1        HipHop
17    Classical
16    Classical
2        HipHop
Name: genre, dtype: object


In [472]:
# Show the TESTING part of the OUTPUT set.. the predictions are compared against these labels
print(y_test)

10        Dance
13     Acoustic
0        HipHop
8     Classical
Name: genre, dtype: object


In [473]:
# Create our model.. random_state pins the tie-breaking between equally good splits,
# so fitting the same training data always grows the same tree
model = DecisionTreeClassifier(random_state=42)

# Train the model on the TRAINING parts of the input & output sets only
model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [474]:
# Pass the TESTING part of the INPUT set.. predict() returns one predicted genre per row
predictions = model.predict(X_test)

In [475]:
# Print each predicted genre on its own line
for predicted_genre in predictions:
    print(predicted_genre)

Dance
Acoustic
HipHop
Classical


In [476]:
# accuracy_score() compares the TRUE labels with the PREDICTED labels
# normalize is True by default.. the result is then the FRACTION of correctly classified samples
# if normalize is set to False, accuracy_score returns the NUMBER of correctly classified samples
# NOTE: the testing set shown above has only 4 rows, so this score is a very coarse estimate
print(accuracy_score(y_true=y_test, y_pred=predictions))

1.0
