# Numpy Indexing and Slicing

In [1]:
# import numpy library
import numpy as np

In [12]:
# create an array
a = np.array([1, 2, 3, 4, 5,6,7])
a

array([1, 2, 3, 4, 5, 6, 7])

In [3]:
a[0] # returns 1 (the first element)

1

In [4]:
a[-1] # returns the last element: 7 for the array defined in In [12] (the 5 recorded below is from an earlier run, before a was redefined)

5

In [5]:
a[-2] # returns the second-last element: 6 for the array defined in In [12] (the 4 recorded below is from an earlier run, before a was redefined)

4

In [13]:
a[:2] # returns [1, 2] (the first two elements)

array([1, 2])

In [14]:
a[1:] # returns [2, 3, 4, 5, 6, 7] (everything from the second element onward)

array([2, 3, 4, 5, 6, 7])

In [17]:
a[1:5]  # returns [2, 3, 4, 5]  (the second to the fifth element)

array([2, 3, 4, 5])

In [6]:
# create an array with the arange function
b = np.arange(0, 12, 2) # arguments: start, stop (exclusive), step
b

array([ 0,  2,  4,  6,  8, 10])

In [7]:
b[2] # returns 4 (the third element)

4

In [9]:
c = b.reshape(2, 3) # reshape the array to 2 rows and 3 columns
c

array([[ 0,  2,  4],
       [ 6,  8, 10]])

In [10]:
c[0] # returns the first row

array([0, 2, 4])

In [11]:
c[1] # returns the second row

array([ 6,  8, 10])

In [18]:
c[c<5] # boolean mask: returns the elements of c that are less than 5 as a flat 1-D array

array([0, 2, 4])

# Can you list the boolean and conditional operators?

----------

# Making an array from data or another array

In [22]:
a = np.arange(0, 222, 5) # arguments: start, stop, step
a

array([  0,   5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,
        65,  70,  75,  80,  85,  90,  95, 100, 105, 110, 115, 120, 125,
       130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190,
       195, 200, 205, 210, 215, 220])

In [23]:
len(a) # returns the length of the array

45

In [24]:
array1 = a[40:45] # returns the elements at indices 40 through 44 (the 41st to 45th, i.e. the last five)
array1

array([200, 205, 210, 215, 220])

In [25]:
# two 1-D arrays of seven elements each, used for the stacking examples
a = np.array([1, 2, 3, 4, 5,6,7])
b = np.array([8, 9, 10, 11, 12,13,14])

In [26]:
c = np.vstack((a, b)) # stack arrays vertically (i.e. as rows) 
c

array([[ 1,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14]])

In [27]:
d = np.hstack((a, b)) # stack arrays horizontally; for 1-D inputs this joins them end to end into one 1-D array
d

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

----------

# Matrix or Matrices

In [28]:
# create a 2 x 3 matrix
a = np.array([[1, 2, 3], [4, 5, 6]])
a

array([[1, 2, 3],
       [4, 5, 6]])

In [30]:
# create a larger matrix: the even numbers 0..98 arranged as 5 rows x 10 columns
b = np.arange(0, 100, 2).reshape(5, 10) # arange arguments: start, stop (exclusive), step; reshape arguments: rows, columns
b

array([[ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
       [40, 42, 44, 46, 48, 50, 52, 54, 56, 58],
       [60, 62, 64, 66, 68, 70, 72, 74, 76, 78],
       [80, 82, 84, 86, 88, 90, 92, 94, 96, 98]])

In [32]:
# slicing a matrix
b[1:3, 3:5] # returns the elements from the second to the third row and the fourth to the fifth column

array([[26, 28],
       [46, 48]])

In [33]:
b[0:, 3:5] # returns the elements from all rows and the fourth to the fifth column

array([[ 6,  8],
       [26, 28],
       [46, 48],
       [66, 68],
       [86, 88]])

In [36]:
b[0:3, 3:5] # returns the elements from the (first to the third row(0:3)) and the (fourth to the fifth column(3:5))

array([[ 6,  8],
       [26, 28],
       [46, 48]])

In [37]:
b[:,:] # returns all elements in the matrix b  (i.e. all rows and columns)

array([[ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
       [40, 42, 44, 46, 48, 50, 52, 54, 56, 58],
       [60, 62, 64, 66, 68, 70, 72, 74, 76, 78],
       [80, 82, 84, 86, 88, 90, 92, 94, 96, 98]])

In [41]:
# slice a matrix 3 x 2 from the matrix b
c = b[0:3, 3:5] # returns the elements from the first to the third row(0:3) and the fourth to the fifth column(3:5)
c

array([[ 6,  8],
       [26, 28],
       [46, 48]])

In [43]:
b.max() # returns the maximum value of the matrix b

98

In [44]:
b.min() # returns the minimum value of the matrix b

0

In [45]:
b.sum() # returns the sum of all elements in the matrix b

2450

In [46]:
b.max(axis=0) # returns the maximum value of each column

array([80, 82, 84, 86, 88, 90, 92, 94, 96, 98])

In [47]:
b.min(axis=1) # returns the minimum value of each row 

array([ 0, 20, 40, 60, 80])

In [48]:
b

array([[ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
       [40, 42, 44, 46, 48, 50, 52, 54, 56, 58],
       [60, 62, 64, 66, 68, 70, 72, 74, 76, 78],
       [80, 82, 84, 86, 88, 90, 92, 94, 96, 98]])

In [50]:
b.max(axis=0) # returns the maximum value of each column  (axis=0 reduces along the rows, leaving one value per column)

array([80, 82, 84, 86, 88, 90, 92, 94, 96, 98])

In [51]:
b.max(axis=1) # returns the maximum value of each row  (axis=1 reduces along the columns, leaving one value per row)

array([18, 38, 58, 78, 98])

In [52]:
m = np.arange(0,20,2).reshape(5,2) # arguments: start, stop, step  (5 rows and 2 columns) 
m

array([[ 0,  2],
       [ 4,  6],
       [ 8, 10],
       [12, 14],
       [16, 18]])

In [59]:
m + np.array([1,1]) # add the array [1,1] to each row of the matrix m

array([[ 1,  3],
       [ 5,  7],
       [ 9, 11],
       [13, 15],
       [17, 19]])

In [58]:
m * np.array([2,2]) # multiply each row of the matrix m by the array [2,2]

array([[ 0,  4],
       [ 8, 12],
       [16, 20],
       [24, 28],
       [32, 36]])

In [60]:
m / np.array([2,2]) # divide each row of the matrix m by the array [2,2]

array([[0., 1.],
       [2., 3.],
       [4., 5.],
       [6., 7.],
       [8., 9.]])

In [57]:
np.vstack((m, np.array([22,24]))) # returns a new array with the row [22, 24] stacked below m; m itself is not modified

array([[ 0,  2],
       [ 4,  6],
       [ 8, 10],
       [12, 14],
       [16, 18],
       [22, 24]])

## How to reverse an array?

In [61]:
b

array([[ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
       [40, 42, 44, 46, 48, 50, 52, 54, 56, 58],
       [60, 62, 64, 66, 68, 70, 72, 74, 76, 78],
       [80, 82, 84, 86, 88, 90, 92, 94, 96, 98]])

In [62]:
np.flip(b) # with no axis given, reverses b along every axis (row order and column order); use np.flip(b, axis=0) for a vertical-only flip

array([[98, 96, 94, 92, 90, 88, 86, 84, 82, 80],
       [78, 76, 74, 72, 70, 68, 66, 64, 62, 60],
       [58, 56, 54, 52, 50, 48, 46, 44, 42, 40],
       [38, 36, 34, 32, 30, 28, 26, 24, 22, 20],
       [18, 16, 14, 12, 10,  8,  6,  4,  2,  0]])

----

# Day-32B: Example Code


Here’s the complete code snippet from start to finish, including all preprocessing techniques such as handling missing values, outlier detection, normalization, standardization, binning, feature engineering, feature selection, encoding of categorical variables, and splitting the dataset into training and testing sets. I’ll provide explanations for each part of the code.


This is just example code to help you build a step-by-step guide for data preprocessing:

In [63]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

# End-to-end preprocessing walkthrough on the Titanic dataset: impute, drop,
# remove outliers, scale, bin, engineer, select, encode, and split.
#
# NOTE: the scalers and encoders below are fitted on the full dataset before
# the train/test split, so test-set statistics leak into the transforms. That
# is acceptable for a demonstration; for real modeling, fit them on the
# training split only and apply them to the test split.

# Load the Titanic dataset
titanic = sns.load_dataset('titanic')

# Handling Missing Values
# Impute missing values in 'age' with the mean
imputer = SimpleImputer(strategy='mean')
titanic['age'] = imputer.fit_transform(titanic[['age']])
# Assume 'deck' has too many missing values and drop it
titanic.drop(columns=['deck'], inplace=True)

# Outlier Detection and Removal
# Detect and remove outliers in 'fare' based on the Interquartile Range (IQR)
Q1 = titanic['fare'].quantile(0.25)
Q3 = titanic['fare'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# between() is inclusive on both ends by default, matching >= / <=.
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
titanic = titanic[titanic['fare'].between(lower_bound, upper_bound)].copy()

# Normalization
# Normalize 'fare' to have values between 0 and 1
scaler_min_max = MinMaxScaler()
titanic['fare_normalized'] = scaler_min_max.fit_transform(titanic[['fare']])

# Standardization
# Standardize 'age' to have a mean of 0 and a standard deviation of 1
scaler_std = StandardScaler()
titanic['age_standardized'] = scaler_std.fit_transform(titanic[['age']])

# Binning
# Transform 'age' into three discrete categories
titanic['age_binned'] = pd.cut(titanic['age'], bins=[0, 18, 60, 100], labels=["Child", "Adult", "Senior"])

# Feature Engineering
# Create a new feature 'family_size' from 'sibsp' and 'parch'
# (+1 counts the passenger themselves)
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1

# Feature Selection
# Keep the 3 features with the highest chi-squared score against 'survived'.
# X_selected is shown for illustration only; the final split below uses a
# hand-picked feature list instead.
X = titanic[['pclass', 'age', 'sibsp', 'parch', 'fare_normalized']]
y = titanic['survived']
selector = SelectKBest(score_func=chi2, k=3)
X_selected = selector.fit_transform(X, y)

# Encoding Categorical Variables
# Convert 'sex' into a numerical format using Label Encoding
label_encoder = LabelEncoder()
titanic['sex_encoded'] = label_encoder.fit_transform(titanic['sex'])

# Convert 'embarked' into binary columns using One-Hot Encoding
one_hot_encoder = OneHotEncoder()
encoded_embarked = one_hot_encoder.fit_transform(titanic[['embarked']]).toarray()
embarked_columns = one_hot_encoder.get_feature_names_out(['embarked'])
# The outlier filter left gaps in titanic's index, and join() aligns on index
# labels. Reuse titanic.index so each encoded row lands on its own passenger;
# a default 0..n-1 index would misalign rows and introduce NaNs.
embarked_frame = pd.DataFrame(encoded_embarked, columns=embarked_columns, index=titanic.index)
titanic = titanic.join(embarked_frame)

# Data Splitting
# Split the data into training and testing sets (80% / 20%, fixed seed).
# y was taken after the outlier filter and no rows were dropped since, so it
# still lines up row-for-row with X.
X = titanic[['pclass', 'sex_encoded', 'age_standardized', 'sibsp', 'parch', 'fare_normalized', 'family_size']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, the dataset is ready for model training

## Explanation of Each Part of the code:


``Load the Titanic dataset:`` We start by loading the Titanic dataset using Seaborn’s built-in dataset loader.

``Handling Missing Values:`` We impute the missing values in the ‘age’ column by replacing them with the mean age. The ‘deck’ column is dropped due to a large number of missing values.

``Outlier Detection and Removal:`` We calculate the Interquartile Range (IQR) for the ‘fare’ column and remove any values that lie outside 1.5 times the IQR from the first and third quartiles, which are considered outliers.

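``Normalization:`` We scale the ‘fare’ column so that its values lie between 0 and 1, which keeps the variable’s magnitude from dominating algorithms that are sensitive to feature scale (for example, distance-based or gradient-based methods). Note that min-max scaling changes only the range of the values, not the shape of their distribution.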

``Standardization:`` We scale the ‘age’ column to have a mean of 0 and a standard deviation of 1, which is useful for algorithms that assume data is centered around zero.

``Binning:`` We transform the continuous ‘age’ variable into discrete categories (Child, Adult, Senior) to simplify analysis and potentially improve model performance.

``Feature Engineering:`` We create a new feature called ‘family_size’ by adding the number of siblings/spouses (‘sibsp’) and the number of parents/children (‘parch’) and adding one (for the passenger themselves).

``Feature Selection:`` We use the SelectKBest method with the chi-squared test to score each candidate feature against the ‘survived’ column and keep the 3 highest-scoring features.

``Encoding Categorical Variables:`` We convert categorical variables like ‘sex’ into numerical format using Label Encoding, and ‘embarked’ into binary columns using One-Hot Encoding, making them suitable for machine learning models.

``Data Splitting:`` Finally, we split the data into training (80%) and testing (20%) sets, using a fixed random seed so that the split is reproducible.

This code prepares the Titanic dataset for predictive modeling, which can now be used to train a machine learning model to predict survival on the Titanic.