In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the Dataset

In [2]:
# NOTE: absolute, machine-specific location; point this at your local copy of mushrooms.csv.
# Raw string: the backslashes of the Windows path must not be parsed as escape
# sequences (\P, \A, \w and \m are invalid escapes and trigger a SyntaxWarning).
df = pd.read_csv(r'E:\PAATSHAALA\Assignments\week15\mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# (number of rows, number of columns) of the raw frame.
print(df.shape)

(8124, 23)


# Encoding

In [4]:
# Integer-encode every column: DataFrame.apply calls le.fit_transform once per
# column, so each column gets its own value -> integer mapping.
# NOTE(review): the single `le` is refit on every call, so afterwards it presumably
# only reflects the last column — confirm before relying on it for inverse_transform.
le = LabelEncoder() 
ds = df.apply(le.fit_transform)
print(type(ds)) 

<class 'pandas.core.frame.DataFrame'>


In [5]:
# Preview of the integer-encoded frame.
ds.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [6]:
# Pull the encoded values out as a plain NumPy array; the cells below slice it by column.
data = ds.values  
print(type(data))
print(data.shape)

<class 'numpy.ndarray'>
(8124, 23)


In [7]:
print(data[:5,:]) # first five rows, all columns (column 0 is the encoded class)

[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


The `class` column, which is the first column, is used as the target here.

In [8]:
 # Number of rows per encoded class value, to check how balanced the target is.
 ds['class'].value_counts()

0    4208
1    3916
Name: class, dtype: int64

# Splitting Data

In [9]:
data_x = data[:,1:] # features: every row, all columns from index 1 onward
data_y = data[:,0] # target: every row of column 0 (the encoded class)


In [10]:
# 80/20 train/test split. The seed is fixed so that the split, and therefore every
# number reported further down, is identical on each Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2,random_state=42)

In [11]:
# x_train holds the feature columns and y_train the matching labels of the training
# split returned by train_test_split (6499 rows in the recorded run).
print(x_train.shape,y_train.shape)

(6499, 22) (6499,)


In [12]:
# Shapes of the held-out features and labels.
print(x_test.shape,y_test.shape)

(1625, 22) (1625,)


In [13]:
# Distinct label values present in the training split, i.e. the classes to predict.
np.unique(y_train)

array([0, 1])

# Building Classifier

In [14]:
# Toy example: comparing an array with a scalar yields an element-wise boolean mask.
a=np.array([1,1,1,0,0,0,0,0,1])
a==1

array([ True,  True,  True, False, False, False, False, False,  True])

In [15]:
# np.sum counts the True entries of the mask in vectorised code; the built-in sum()
# would walk the array element by element in Python, which is no faster than a for loop.
print(np.sum(a==1))

4


In [16]:
def prior_prob(y_train, label):
    """Return the prior probability of ``label``.

    The prior is estimated as the relative frequency of ``label`` in
    ``y_train``: (number of entries equal to ``label``) / (number of entries).

    Parameters
    ----------
    y_train : numpy.ndarray
        1-D array of class labels.
    label : scalar
        The class whose prior is wanted.

    Returns
    -------
    float
        Fraction of ``y_train`` equal to ``label``.
    """
    n_total = float(y_train.shape[0])
    n_matching = np.sum(y_train == label)
    return n_matching / n_total

In [17]:
# Toy label vector: 5 of its 10 entries are 1.
y=np.array([1,1,1,0,0,0,0,0,1,1])

# Example use of prior_prob: half of the entries equal 1, so the prior of class 1 is 0.5.
prior_prob(y,1)

0.5

In [18]:
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    """Return P(feature ``feature_col`` == ``feature_val`` | class == ``label``).

    The probability is estimated as a relative frequency: among the training
    rows whose label is ``label``, the fraction whose value in column
    ``feature_col`` equals ``feature_val``.

    Parameters
    ----------
    x_train : numpy.ndarray
        2-D feature matrix, one row per training example.
    y_train : numpy.ndarray
        1-D label array aligned with the rows of ``x_train``.
    feature_col : int
        Column index of the feature.
    feature_val : scalar
        Feature value whose conditional probability is wanted.
    label : scalar
        Class to condition on.

    Returns
    -------
    float
        The estimated conditional probability.
    """
    in_class = (y_train == label)
    # Column of interest, restricted to the rows that belong to `label`.
    class_column = x_train[in_class][:, feature_col]
    n_matching = np.sum(class_column == feature_val)
    return n_matching / float(np.sum(in_class))

Determine the posterior probability of each class for a test example and use it to make a prediction

In [19]:
np.unique(y_train) # distinct class labels; predict() below iterates over exactly these

array([0, 1])

In [20]:
def predict(x_train,y_train,xtest):
    """Predict the class label of a single sample with categorical Naive Bayes.

    For every class ``c`` found in ``y_train`` the unnormalised posterior

        P(c) * prod_f P(x_f == xtest[f] | c)

    is estimated from relative frequencies in the training data (the same
    quantities that ``prior_prob`` and ``cond_prob`` compute), and the class
    with the largest posterior is returned.

    Parameters
    ----------
    x_train : numpy.ndarray
        2-D feature matrix, one row per training example.
    y_train : numpy.ndarray
        1-D label array aligned with the rows of ``x_train``.
    xtest : array-like
        A single test point holding one value per feature column.

    Returns
    -------
    scalar
        The class label (an element of ``np.unique(y_train)``) with the
        highest posterior. On ties the first label in sorted order wins.

    Notes
    -----
    No smoothing is applied: a feature value never seen together with a class
    gives that class a posterior of exactly 0.
    """
    classes = np.unique(y_train)
    n_total = float(y_train.shape[0])
    xtest = np.asarray(xtest)

    # Unnormalised posterior of every class, in the order of `classes`.
    post_probs = []
    for label in classes:
        # Select the rows of this class once, instead of once per feature.
        x_label = x_train[y_train == label]
        n_label = x_label.shape[0]

        # Per-feature P(x_f == xtest[f] | label): fraction of class rows that match.
        cond = np.sum(x_label == xtest, axis=0) / float(n_label)
        likelihood = np.prod(cond)

        prior = n_label / n_total
        post_probs.append(likelihood * prior)

    # argmax is a position in `classes`; map it back to the actual label so the
    # result is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]

In [21]:
# Spot check on one held-out sample: print the predicted label, then the true label.
output = predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

0
0


In [22]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of ``predict`` on a test set.

    Every row of ``x_test`` is classified with ``predict`` using
    ``x_train``/``y_train`` as training data; the result is the fraction of
    predictions that equal the corresponding entry of ``y_test``.

    Parameters
    ----------
    x_train, y_train : numpy.ndarray
        Training features and labels, passed through to ``predict``.
    x_test : numpy.ndarray
        2-D feature matrix of the examples to classify.
    y_test : numpy.ndarray
        True labels aligned with the rows of ``x_test``.

    Returns
    -------
    float
        Number of correct predictions divided by ``y_test.shape[0]``.
    """
    # One prediction per test row, in row order.
    predictions = np.array(
        [predict(x_train, y_train, x_test[row]) for row in range(x_test.shape[0])]
    )
    n_correct = np.sum(predictions == y_test)
    return n_correct / y_test.shape[0]

In [23]:
# Accuracy of the hand-written Naive Bayes classifier on the held-out test split.
print(score(x_train,y_train,x_test,y_test))

0.9969230769230769
