In [180]:
import pandas as pd
import numpy as np

In [181]:
data = pd.read_csv('Data.csv')

In [159]:
data.shape

(10, 4)

In [160]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [182]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,GErmany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [183]:
X = data.drop(['Purchased'], axis = 1)
y = data['Purchased']
X.iloc[8,0] = 'Germany'

# TAKE CARE OF NULL VALUES (USE IMPUTE and COLUMN TRANSFORMER)

In [185]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [186]:
imputer = SimpleImputer(missing_values = np.NaN , strategy = 'mean') #takes arguments 'missing_values' to specify type of missing values
                                                                    # and 'strategy' to specify what to impute

In [187]:
ct = ColumnTransformer([('imputer',imputer,[1,2])], remainder = 'passthrough') #takes arugument 1 is a list of tuples consisting of name of transformation, transformer and columns to be transformed
                                                                                #and specify what to do with the remaining columns 

In [188]:
X = pd.DataFrame(ct.fit_transform(X), columns = ['Age', 'Salary', 'Country'])

In [189]:
X

Unnamed: 0,Age,Salary,Country
0,44.0,72000.0,France
1,27.0,48000.0,Spain
2,30.0,54000.0,Germany
3,38.0,61000.0,Spain
4,40.0,63777.8,Germany
5,35.0,58000.0,France
6,38.7778,52000.0,Spain
7,48.0,79000.0,France
8,50.0,83000.0,Germany
9,37.0,67000.0,France


# Second way to deal with the missing values

In [121]:
X2 = data.drop(['Purchased'], axis = 1)
y2 = data['Purchased']

In [122]:
from sklearn.impute import SimpleImputer

In [125]:
imputer2 = SimpleImputer(missing_values = np.NaN, strategy = 'mean')

In [136]:
X2.loc[:,['Age','Salary']] = imputer2.fit_transform(X2.loc[:,['Age','Salary']])

In [140]:
X2.iloc[8,0] = 'Germany'

# encode the feature country so that it can be fed into a model

In [190]:
#country is an unordered set of elements
#use the one-hot-encoder for nominal encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ohe = OneHotEncoder()

In [171]:
ct_2 = ColumnTransformer([('ohe',ohe,[2])], remainder = 'passthrough')

In [192]:
X = ct_2.fit_transform(X)

# encode the dependant variable into numerical values

In [196]:
from sklearn.preprocessing import LabelEncoder

In [197]:
lblEncoder = LabelEncoder()

In [198]:
y = lblEncoder.fit_transform(y)

# split data into test and train

In [236]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42, test_size = 0.2)

In [237]:
X_train

array([[1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0]], dtype=object)

# model the data and fit

In [228]:
from sklearn.linear_model import LogisticRegression

In [227]:
lr = LogisticRegression(random_state = 42)

In [225]:
lr.fit(X_train, y_train)
lr_score = lr.score(X_test,y_test)
lr_score

0.5