In [31]:
import numpy as np
from numpy import linalg
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import pandas as pd
import numpy as np
import utils
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

In [32]:
def split_data(df,label):
    '''
    utilitary function used to generate data splits for the first two parts of the lab
    '''
    try:
    # Doesn't work: a value is missing
        train_data, test_data = train_test_split(df, test_size = 0.2, 
                                                 stratify=df[label])
    except:
        # Count the missing lines and drop them
        missing_rows = np.isnan(df[label])
        #print("Uh oh, {} lines missing data! Dropping them".format(np.sum(missing_rows)))
        df = df.dropna(subset=[label])
        train_data, test_data = train_test_split(df, test_size = 0.2, 
                                                 stratify=df[label])
        
    return train_data, test_data



def fit_model(X, y):
    '''
    Least squares solution of a linear model of the form y = W^Tx
    returns the estimated weights vector
    '''
    X_t = np.transpose(X) #X^T
    X_t_X = X_t.dot(X)    #X^TX
    X_T_y = X_t.dot(y)    #X^Ty
    
    #An alternative and more efficient way to compute: using a linear solver to solve the eq Ax = b
    w = linalg.solve(X_t_X, X_T_y)
    return w


def fit_logreg(X, y):
    '''
    Wraps initialization and training of Logistic regression
    '''
    logreg = LogisticRegression(C=1e20, solver='liblinear', max_iter=200) #
    logreg.fit(X, y)
    
    return logreg

def comparing_plots(xx,yy, X, y, data_1, data_2, title_1, title_2):
    '''
    utilitary function to plot results from two methods side by side. 
    It displays the training data with different colours and uses the same colours to differentiate 
    the different regions defined by the decision boundaries.
    '''
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    plt.rcParams['figure.figsize'] = [20, 10]
    plt.subplot(121)
    plt.pcolormesh(xx, yy, data_1, cmap=cmap_light)

    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title(title_1)
    
    plt.subplot(122)

    plt.pcolormesh(xx, yy, data_2, cmap=cmap_light)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title(title_2)
    plt.show()
    
def gaussians():
    '''
    Generates data from a multivariate Gaussian distribution.
    Means, covariances and number of samples are fixed.
    '''
    N=50
    means = np.array([[4.5, 4.5],
                      [5.5, 2.5],
                      [6.3,3.5]])
    covs = np.array([np.diag([0.5, 0.5]),
                     np.diag([0.5, 0.5]),
                     np.diag([0.5, 0.5])])
    y=[]
    points = []
    for i in range(len(means)):
        x = np.random.multivariate_normal(means[i], covs[i], N )
        points.append(x)
        y.append(i*np.ones(N)) 
    points = np.concatenate(points)
    y=np.concatenate(y)
    
    return points, y

In [33]:
genres_df = pd.read_csv('../MusicGenreClassification/Music_data_set_2genres.csv')

train = pd.read_csv('../MusicGenreClassification/Music_data_set_2genres_training.csv')
test = pd.read_csv('../MusicGenreClassification/Music_data_set_2genres_testing.csv')



print(genres_df.dtypes)     # Prints out the data types associated to each of the fields in the table
genres_df.head()            # df.head(N) displays the top N entries of a dataframe. If no arguments shows 5 by default.

Unnamed: 0            int64
song                 object
mean_mfccs          float64
mean_chroma_stft    float64
tempo               float64
pulse               float64
flatness            float64
contrast            float64
zero_crossing         int64
genre                object
genre_binary          int64
dtype: object


Unnamed: 0.1,Unnamed: 0,song,mean_mfccs,mean_chroma_stft,tempo,pulse,flatness,contrast,zero_crossing,genre,genre_binary
0,0,blues.00000.wav,1.938016,0.350088,123.046875,0.274049,0.004498,20.526699,55031,blues,0
1,1,blues.00001.wav,-0.055611,0.340914,107.666016,0.268452,0.002298,20.676128,37139,blues,0
2,2,blues.00002.wav,1.034164,0.363637,161.499023,0.26426,0.002631,22.197517,50563,blues,0
3,3,blues.00003.wav,0.071344,0.404785,172.265625,0.259775,0.000954,21.426268,22077,blues,0
4,4,blues.00004.wav,-8.458839,0.308526,135.999178,0.213662,0.003238,21.466338,67225,blues,0


In [34]:
y_train = train['genre_binary']
y_test = test['genre_binary']

print(y_test)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    1
Name: genre_binary, dtype: int64


In [35]:

def prepare_X(ds):
    '''
    Prepares training data by selecting the features to use and the adding the dummy variable x[0] = 1
    '''
    X_cols = ds[['mean_mfccs' , 
                'mean_chroma_stft' , 
                'tempo' , 
                 'pulse' , 
                 'flatness' , 
                 'contrast' ,
                 'zero_crossing']].copy()
    X = X_cols.values
    X = X.reshape(len(X_cols),-1)
    
    #We add the dummy x_0
    poly = PolynomialFeatures(1)  
    X = poly.fit_transform(X)

    return X

In [36]:
#Call prepare_X to prepare training data. We do the same for test data, although this will only be used later on.
X = prepare_X(train)
X_test = prepare_X(test)

In [37]:
W = utils.fit_model(X,y_train.values)

In [38]:
def run(W, X_test, y_test):
    '''
    Collects the necessary steps to predict and then evaluate a particular model. 
    As inputs it receives the model parameters an input dataset X_test and the corresponding targets y_test
    It returs the MSE.
    '''
    y_hat_test = predict(W,X_test)
    mse = MSE(y_test, y_hat_test)
    
    return mse

def predict(W, X):
    return np.dot(X,W)
    
def MSE(y, y_hat):
    N = len(y)
    sum = 0
    for i in range(0, N - 1):
        sum = sum + (y[i] - y_hat[i]) ** 2
    return sum / N
 
run(W, X_test, y_test)

0.3155407070546966

In [39]:
mse = run(W,X,y_train)

print('*******************************************************************************************')
print('[W_0,W] : [', W[0],',', W[1:], ']' )
print('MSE: ', mse)
print('*******************************************************************************************')

*******************************************************************************************
[W_0,W] : [ 2.3428113875936765 , [-2.85538703e-02  3.24993804e+00 -1.15938344e-03 -2.72158296e+00
  1.95085737e+00 -1.06658586e-01 -4.47267836e-07] ]
MSE:  0.13664820897910773
*******************************************************************************************
