In [None]:
# Scikit-Learn provides common machine learning algorithms such as decision trees, neural networks and so on.
# NumPy provides a multidimensional array.
# Pandas is a data analysis library that provides a concept called a data frame.
# A data frame is a two-dimensional data structure similar to an Excel spreadsheet.
# Matplotlib is a two-dimensional plotting library for creating graphs and plots.

# Importing a Data Set


In [3]:
import pandas as pd
# Load the video game sales CSV into a data frame.
df = pd.read_csv('vgsales.csv')
# Show the size of the table as a (rows, columns) tuple.
df.shape

(16598, 11)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [7]:
# View the frame as a plain NumPy array. DataFrame.to_numpy() is the
# recommended replacement for the older .values attribute and returns the same
# array here (dtype=object, because the columns mix numbers and strings).
df.to_numpy()

array([[1, 'Wii Sports', 'Wii', ..., 3.77, 8.46, 82.74],
       [2, 'Super Mario Bros.', 'NES', ..., 6.81, 0.77, 40.24],
       [3, 'Mario Kart Wii', 'Wii', ..., 3.79, 3.31, 35.82],
       ...,
       [16598, 'SCORE International Baja 1000: The Official Game', 'PS2',
        ..., 0.0, 0.0, 0.01],
       [16599, 'Know How 2', 'DS', ..., 0.0, 0.0, 0.01],
       [16600, 'Spirits & Spells', 'GBA', ..., 0.0, 0.0, 0.01]],
      dtype=object)

# A Real Problem

In [8]:
# Imagine we have an online music store. When our users sign up, we ask for their age and gender,
# and based on their profile, we recommend various music albums they are likely to buy.
# So in this project, we want to use machine learning to increase sales.
# We want to build a model and feed it some sample data based on the existing
# users. Our model will learn the patterns in our data, so we can ask it to make predictions.
# When a new user signs up, our model will predict what kind of music the new user
# is interested in.


In [None]:
# 1. Import the Data
# 2. Clean the Data
# 3. Split the Data into Training/Test Sets
# 4. Create a Model
# 5. Train the Model
# 6. Make Predictions
# 7. Evaluate and Improve

#  Import the Data

In [12]:
import pandas as pd
# Load the sample user profiles (age, gender) together with their preferred genre.
music_data = pd.read_csv('music.csv')
# Display the loaded frame to inspect its rows.
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


# Preparing the Data

In [13]:
# Preparing the data involves tasks such as removing duplicates, null values and so on.
# In this project, we should split this dataset into two separate datasets: one with the first
# two columns, which we refer to as the input set, and the other with the last column, which we refer to
# as the output set. So when we train a model we give it two separate datasets: the input set and
# the output set. The output set, which in this case is the genre column, contains the predictions.
# So we are telling our model that if we have a user who's 20 years old and male, they like HipHop.
# Once we train our model, we then give it a new input set.

In [46]:
# train_test_split has to be imported before it is called. Importing it in this
# cell keeps the cell runnable on a fresh kernel (Restart & Run All) instead of
# relying on an import executed in some other cell.
from sklearn.model_selection import train_test_split

# DataFrame.drop returns a new frame without the listed columns, so x holds the
# input features and music_data itself is left untouched.
x = music_data.drop(columns=['genre'])
# y is the output column the model should learn to predict.
y = music_data['genre']
# Hold out 20% of the rows for testing. No random_state is passed, so the rows
# are shuffled differently on every run and the resulting score can vary.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


# Learning and Predicting


In [47]:
# Here we will use a decision tree; we can import the algorithm from the library called scikit-learn.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train a decision tree on the training split only.
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
# Predict genres for the held-out rows and compare them with the true labels.
predictions = model.predict(x_test)
score = accuracy_score(y_test, predictions)
# Only the last expression of a cell is displayed, so a bare `predictions`
# placed before `score` printed nothing; inspect `predictions` in its own cell
# if needed. The cell shows the accuracy (fraction of correct predictions).
score
# When we change to test_size=0.8,
# the score will change because we are using too little data to train this model.
# This is one of the key concepts in machine learning: the more data we give to our model, and the cleaner
# the data is, the better the result. If we have duplicates, irrelevant data, or incomplete
# values, our model will learn bad patterns in our data. That's why it's really important to clean
# our data before training our model.

1.0

# Calculating the Accuracy

In [44]:
# We need to separate our dataset into two parts: one for training, and the other for testing.
# A general rule for calculating the accuracy is to allocate 70-80% of our data to training, and
# the other 20-30% to testing. Then, instead of passing only two samples for
# making predictions, we can pass the whole test dataset. We'll get the
# predictions and then compare the predictions with the actual values in the test set.

from sklearn.model_selection import train_test_split


# Model Persistence

In [55]:
# In real applications, we may have hundreds or thousands of examples, and training a model might take
# hours. That's why model persistence is important: once in a while, we build and train our
# model and then save it to a file. Next time we want to make predictions, we simply load the model from
# the file and ask it to make predictions. The model is already trained, so we don't need to retrain it; it
# is like an intelligent person.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

# Load the sample users, then separate the label column from the input features.
music_data = pd.read_csv('music.csv')
y = music_data['genre']
x = music_data.drop(columns=['genre'])

# fit() returns the estimator itself, so construction and training can be chained.
model = DecisionTreeClassifier().fit(x, y)

# Save the trained model to disk; dump() returns the list of files it wrote,
# which is what this cell displays.
joblib.dump(model, 'music-recommender.joblib')

['music-recommender.joblib']

In [57]:
# Reload the already-trained model from disk instead of retraining it.
# NOTE: joblib files are pickle-based, so only load files you created or trust.
model = joblib.load('music-recommender.joblib')

# The model was fitted on a DataFrame with the columns 'age' and 'gender', so the
# new sample is passed with the same column names. A bare nested list such as
# [[21, 1]] makes recent scikit-learn versions warn that X has no valid feature
# names.
new_user = pd.DataFrame([[21, 1]], columns=['age', 'gender'])
predictions = model.predict(new_user)
predictions



array(['HipHop'], dtype=object)

#  Visualizing a Decision Tree

In [60]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree


# Load the sample users and split them into input features (x) and labels (y).
music_data = pd.read_csv('music.csv')
x = music_data.drop(columns=['genre'])
y = music_data['genre']

# Train on the full dataset; this cell is about visualising the tree, not scoring it.
model = DecisionTreeClassifier()
model.fit(x, y)

# Write the fitted tree to a Graphviz DOT file. The feature and class names are
# taken from the training data and the fitted model rather than hard-coded, so
# the labels in the graph cannot drift out of sync with the columns the model
# actually saw. model.classes_ is already in the sorted order export_graphviz
# expects for class_names.
tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    feature_names=list(x.columns),
    class_names=list(model.classes_),
    label='all',      # label every node with its split/impurity/sample details
    rounded=True,     # draw node boxes with rounded corners
    filled=True,      # colour nodes by majority class
)