# Data Preprocessing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

## Importing the dataset

In [None]:
# Load the raw dataset; 'data.csv' is resolved relative to the working directory.
data_set = pd.read_csv('data.csv')
# Every column except the last is treated as a feature; the last column is the
# dependent variable. to_numpy() is the pandas-recommended accessor and always
# yields a NumPy ndarray, whereas .values may hand back an extension array for
# some column dtypes.
features = data_set.iloc[:, :-1].to_numpy()
dependent_vector = data_set.iloc[:, -1].to_numpy()

In [None]:
# Inspect the feature matrix as loaded (every column except the last).
print(features)

In [None]:
# Inspect the dependent variable as loaded (the last column of the dataset).
print(dependent_vector)

## Taking care of missing data

In [None]:
# Replace NaN entries in feature columns 1 and 2 with the per-column mean.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split further down, so test rows contribute to the imputed means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
features[:, 1:3] = imputer.fit_transform(features[:, 1:3])

In [None]:
# Inspect the features after mean imputation of columns 1 and 2.
print(features)

## Encoding categorical data

### Encoding the Independent Variable

In [None]:
# One-hot encode column 0 and pass every other column through unchanged.
# The encoded columns are placed first in the output, followed by the
# passthrough columns in their original order.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix when the one-hot block is large,
# and np.array() on a sparse matrix yields a 0-d object array rather than a
# 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
features = np.array(ct.fit_transform(features))

In [None]:
# Inspect the features after one-hot encoding column 0.
print(features)

### Encoding the Dependent Variable

In [None]:
# Learn the distinct labels of the dependent variable, then map each label
# to its integer code.
le = LabelEncoder().fit(dependent_vector)
dependent_vector = le.transform(dependent_vector)

In [None]:
# Inspect the label-encoded dependent variable.
print(dependent_vector)

## Splitting the dataset into the Training set and Test set

In [None]:
# Hold out 20% of the rows as the test set; random_state=0 makes the shuffle
# reproducible between runs.
(
    feature_train,
    feature_test,
    dependent_train,
    dependent_test,
) = train_test_split(
    features,
    dependent_vector,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Inspect the training features produced by the split (not yet scaled).
print(feature_train)

In [None]:
# Inspect the test features produced by the split (not yet scaled).
print(feature_test)

In [None]:
# Inspect the training labels produced by the split.
print(dependent_train)

In [None]:
# Inspect the test labels produced by the split.
print(dependent_test)

## Feature Scaling

In [None]:
# Standardise only the numeric columns and leave the one-hot columns as 0/1.
# The ColumnTransformer puts the one-hot columns first, so the numeric columns
# start right after them. Derive that offset from the fitted encoder instead of
# hard-coding 3, which is only right when column 0 has exactly three categories.
# (Assumes the encoder keeps its default drop=None, i.e. one output column per
# category.)
first_numeric_col = len(ct.named_transformers_['encoder'].categories_[0])
sc = StandardScaler()
# Fit on the training set only; the test set is transformed with the training
# mean and standard deviation so no test statistics leak into the scaler.
feature_train[:, first_numeric_col:] = sc.fit_transform(feature_train[:, first_numeric_col:])
feature_test[:, first_numeric_col:] = sc.transform(feature_test[:, first_numeric_col:])

In [None]:
# Inspect the training features after scaling.
print(feature_train)

In [None]:
# Inspect the test features after scaling.
print(feature_test)