In [1]:
# Importing Numpy and Pandas libraries
import numpy as np
import pandas as pd

# NumPy Basics

#### Creating Arrays



In [2]:
# NumPy Array Creation: Exploring Different Techniques

# Creating a one-dimensional array
arr1 = np.array([1, 2, 3, 4, 5])
print("\n1D Array (arr1):")
print(arr1)
print("Data Type:", arr1.dtype)  # Illustrate data type inference
print("Shape:", arr1.shape)       # Display array dimensions

# Creating a two-dimensional array
arr2 = np.array([[1, 2, 3], [4, 5, 6]])
print("\n2D Array (arr2):")
print(arr2)
print("Data Type:", arr2.dtype)
print("Shape:", arr2.shape)

# Generating an array filled with zeros
zeros = np.zeros((3, 3))
print("\nZeros Array (zeros):")
print(zeros)

# Generating an array filled with ones
ones = np.ones((2, 4))
print("\nOnes Array (ones):")
print(ones)

# Creating an uninitialized array (may contain arbitrary values)
empty = np.empty((2, 2))
print("\nEmpty Array (empty):")
print(empty)  

# Generating an array with a sequence of numbers
range_arr = np.arange(10)
print("\nRange Array (range_arr):")
print(range_arr)


1D Array (arr1):
[1 2 3 4 5]
Data Type: int64
Shape: (5,)

2D Array (arr2):
[[1 2 3]
 [4 5 6]]
Data Type: int64
Shape: (2, 3)

Zeros Array (zeros):
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

Ones Array (ones):
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]]

Empty Array (empty):
[[4.65919360e-310 0.00000000e+000]
 [8.67144183e+199 5.96673156e-154]]

Range Array (range_arr):
[0 1 2 3 4 5 6 7 8 9]


#### NumPy Array Operations

In [3]:
# Creating sample arrays for demonstration
arr = np.array([1, 2, 3, 4])
arr2 = np.array([5, 6, 7, 8])

# Element-wise addition of two arrays
sum_arr = arr + arr2
print("\nElement-wise Sum (arr + arr2):", sum_arr)

# Element-wise subtraction of two arrays
diff_arr = arr - arr2
print("\nElement-wise Difference (arr - arr2):", diff_arr)

# Element-wise multiplication of two arrays
product_arr = arr * arr2
print("\nElement-wise Product (arr * arr2):", product_arr)

# Element-wise division of two arrays
quotient_arr = arr / arr2
print("\nElement-wise Quotient (arr / arr2):", quotient_arr)

# Adding a scalar value to each element of the array
scalar_add = arr + 2
print("\nScalar Addition (arr + 2):", scalar_add)

# Multiplying each element of the array by a scalar value
scalar_mult = arr * 3
print("\nScalar Multiplication (arr * 3):", scalar_mult)



Element-wise Sum (arr + arr2): [ 6  8 10 12]

Element-wise Difference (arr - arr2): [-4 -4 -4 -4]

Element-wise Product (arr * arr2): [ 5 12 21 32]

Element-wise Quotient (arr / arr2): [0.2        0.33333333 0.42857143 0.5       ]

Scalar Addition (arr + 2): [3 4 5 6]

Scalar Multiplication (arr * 3): [ 3  6  9 12]


#### Indexing and Slicing

In [4]:
# Creating a sample array
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8])
print("\nOriginal Array (arr):", arr)

# Basic Indexing: Accessing Individual Elements
first_element = arr[0]
print("\nFirst Element (arr[0]):", first_element)  # Index starts at 0

fifth_element = arr[4]
print("Fifth Element (arr[4]):", fifth_element)

# Slicing: Extracting Subarrays
slice_1_to_4 = arr[1:5]  # Note: slicing is upper-bound exclusive
print("\nSlice from Index 1 to 4 (arr[1:5]):", slice_1_to_4)

slice_3_onwards = arr[3:]
print("Slice from Index 3 Onwards (arr[3:]):", slice_3_onwards)

slice_upto_5 = arr[:6]
print("Slice Up to Index 5 (arr[:6]):", slice_upto_5)

# Advanced Indexing: Selecting Elements at Specific Indices
indices = [1, 3, 5]
selected_elements = arr[indices]
print("\nElements at Indices 1, 3, and 5 (arr[[1, 3, 5]]):", selected_elements)



Original Array (arr): [1 2 3 4 5 6 7 8]

First Element (arr[0]): 1
Fifth Element (arr[4]): 5

Slice from Index 1 to 4 (arr[1:5]): [2 3 4 5]
Slice from Index 3 Onwards (arr[3:]): [4 5 6 7 8]
Slice Up to Index 5 (arr[:6]): [1 2 3 4 5 6]

Elements at Indices 1, 3, and 5 (arr[[1, 3, 5]]): [2 4 6]


#### Shape & Reshape

In [5]:
# Creating a sample 2D array
arr = np.array([[1, 2, 3], [4, 5, 6]])
print("\nOriginal Array (arr):")
print(arr)

# Retrieving the shape of the array (number of rows, number of columns)
original_shape = arr.shape
print("\nShape of Original Array (arr.shape):", original_shape)

# Reshaping the array into a new configuration (3 rows, 2 columns)
reshaped_arr = arr.reshape((3, 2))  # Ensure the total number of elements remains the same
print("\nReshaped Array (arr.reshape((3, 2))):")
print(reshaped_arr)

# Flattening the array into a 1D array
flattened_arr = arr.reshape(-1)
print("\nFlattened Array (arr.reshape(-1)):", flattened_arr)

# Automatically calculating one dimension for reshaping (useful for flattening)
reshaped_2d_auto = arr.reshape(3, -1)
print("\nReshaped with Automatic Dimension Calculation:", reshaped_2d_auto)



Original Array (arr):
[[1 2 3]
 [4 5 6]]

Shape of Original Array (arr.shape): (2, 3)

Reshaped Array (arr.reshape((3, 2))):
[[1 2]
 [3 4]
 [5 6]]

Flattened Array (arr.reshape(-1)): [1 2 3 4 5 6]

Reshaped with Automatic Dimension Calculation: [[1 2]
 [3 4]
 [5 6]]


#### Universal & Statistical Functions

In [6]:
# Creating a sample array
arr = np.array([1, 2, 3, 4, 5])
print("\nOriginal Array (arr):", arr)

# Mathematical ufuncs:
sqrt_arr = np.sqrt(arr)
print("\nSquare Roots (np.sqrt(arr)):", sqrt_arr)

exp_arr = np.exp(arr)
print("\nExponentials (np.exp(arr)):", exp_arr)

sin_arr = np.sin(arr)
print("\nSines (np.sin(arr)):", sin_arr)

log_arr = np.log(arr) # Natural logarithm
print("\nNatural Logarithms (np.log(arr)):", log_arr)


# Statistical ufuncs:
mean = np.mean(arr)
print("\nMean (np.mean(arr)):", mean)

std_dev = np.std(arr)
print("\nStandard Deviation (np.std(arr)):", std_dev)

var = np.var(arr)
print("\nVariance (np.var(arr)):", var)


Original Array (arr): [1 2 3 4 5]

Square Roots (np.sqrt(arr)): [1.         1.41421356 1.73205081 2.         2.23606798]

Exponentials (np.exp(arr)): [  2.71828183   7.3890561   20.08553692  54.59815003 148.4131591 ]

Sines (np.sin(arr)): [ 0.84147098  0.90929743  0.14112001 -0.7568025  -0.95892427]

Natural Logarithms (np.log(arr)): [0.         0.69314718 1.09861229 1.38629436 1.60943791]

Mean (np.mean(arr)): 3.0

Standard Deviation (np.std(arr)): 1.4142135623730951

Variance (np.var(arr)): 2.0


#### Linear Algebra

In [7]:
# Creating a sample 2D array (matrix)
arr = np.array([[1, 2], [3, 4]])
print("\nMatrix (arr):\n", arr)

# Calculating the dot product (matrix multiplication) of the matrix with itself
dot_product = np.dot(arr, arr)
print("\nDot Product (arr.dot(arr)):\n", dot_product)

# Alternative syntax for dot product (using the @ operator)
dot_product_alt = arr @ arr
print("\nDot Product Alternative (arr @ arr):\n", dot_product_alt)

# Calculating the determinant of the matrix
det = np.linalg.det(arr)
print("\nDeterminant (np.linalg.det(arr)):", det)

# Calculating the inverse of the matrix (if it exists)
try:
    inv = np.linalg.inv(arr)
    print("\nInverse (np.linalg.inv(arr)):\n", inv)

    # Verifying the inverse (multiplying a matrix by its inverse should yield the identity matrix)
    identity = arr @ inv
    print("\nVerification (arr @ inv):\n", np.round(identity))  # Round for display purposes
except np.linalg.LinAlgError as e:
    print("\nError:", e)  # Handle the case where the matrix is not invertible (singular)



Matrix (arr):
 [[1 2]
 [3 4]]

Dot Product (arr.dot(arr)):
 [[ 7 10]
 [15 22]]

Dot Product Alternative (arr @ arr):
 [[ 7 10]
 [15 22]]

Determinant (np.linalg.det(arr)): -2.0000000000000004

Inverse (np.linalg.inv(arr)):
 [[-2.   1. ]
 [ 1.5 -0.5]]

Verification (arr @ inv):
 [[1. 0.]
 [0. 1.]]


#### Advanced Indexing and Slicing

In [8]:
# Creating a sample 2D array
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("\nOriginal Array (arr):\n", arr)

# Boolean Indexing: Filtering Based on Conditions
bool_idx = arr > 5
filtered_arr = arr[bool_idx]
print("\nElements Greater Than 5 (arr[arr > 5]):", filtered_arr)

# More complex boolean indexing: Multiple conditions
bool_idx_complex = (arr > 2) & (arr < 8)  # Combine conditions with & (and) or | (or)
filtered_arr_complex = arr[bool_idx_complex]
print("\nElements Between 2 and 8 (Exclusive):", filtered_arr_complex)

# Fancy Indexing: Selecting Elements Using Coordinate Pairs
row_indices = [0, 1]
col_indices = [1, 2]
fancy_indexed_elements = arr[row_indices, col_indices]
print("\nElements at (0, 1) and (1, 2) (arr[[0, 1], [1, 2]]):", fancy_indexed_elements)

# Combining Fancy Indexing and Slicing: Extract a submatrix
sub_matrix = arr[1:, 1:]  # Get elements from row 1 onwards and column 1 onwards
print("\nSubmatrix (arr[1:, 1:]):\n", sub_matrix)



Original Array (arr):
 [[1 2 3]
 [4 5 6]
 [7 8 9]]

Elements Greater Than 5 (arr[arr > 5]): [6 7 8 9]

Elements Between 2 and 8 (Exclusive): [3 4 5 6 7]

Elements at (0, 1) and (1, 2) (arr[[0, 1], [1, 2]]): [2 6]

Submatrix (arr[1:, 1:]):
 [[5 6]
 [8 9]]


# Pandas Basics

In [9]:
# Creating a DataFrame from a dictionary
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [24, 27, 22],
    'City': ['New York', 'San Francisco', 'Los Angeles']
}

df = pd.DataFrame(data)

# Display the DataFrame
print("\nDataFrame (df):")
print(df)

# Accessing column data
names = df['Name']  # Accessing by column name
print("\nNames Column (df['Name']):", names)

ages = df.Age  # Accessing using dot notation
print("Ages Column (df.Age):", ages)

# Summary statistics
summary_stats = df.describe()  # Generates summary statistics for numerical columns
print("\nSummary Statistics (df.describe()):")
print(summary_stats)

# Display the first 5 rows
first_5_rows = df.head()
print("\nFirst 5 Rows (df.head()):")
print(first_5_rows)

# Display the last 3 rows
last_3_rows = df.tail(3)  # Display the last 3 rows
print("\nLast 3 Rows (df.tail(3)):")
print(last_3_rows)

# Getting information about the DataFrame
print("\nDataFrame Information (df.info()):")
df.info()


DataFrame (df):
      Name  Age           City
0    Alice   24       New York
1      Bob   27  San Francisco
2  Charlie   22    Los Angeles

Names Column (df['Name']): 0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object
Ages Column (df.Age): 0    24
1    27
2    22
Name: Age, dtype: int64

Summary Statistics (df.describe()):
             Age
count   3.000000
mean   24.333333
std     2.516611
min    22.000000
25%    23.000000
50%    24.000000
75%    25.500000
max    27.000000

First 5 Rows (df.head()):
      Name  Age           City
0    Alice   24       New York
1      Bob   27  San Francisco
2  Charlie   22    Los Angeles

Last 3 Rows (df.tail(3)):
      Name  Age           City
0    Alice   24       New York
1      Bob   27  San Francisco
2  Charlie   22    Los Angeles

DataFrame Information (df.info()):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ---

#### Indexing & Slicing

In [10]:
# Column Selection: Accessing Specific Columns
names_column = df['Name']
print("\nNames Column (df['Name']):", names_column)

multiple_columns = df[['Name', 'City']]  # Select multiple columns using a list
print("\nName and City Columns (df[['Name', 'City']]):")
print(multiple_columns)


# Row Selection by Index (Position): Using `.iloc` 
first_row = df.iloc[0]
print("\nFirst Row (df.iloc[0]):", first_row)  # Returns a Series (row)

multiple_rows_by_index = df.iloc[[0, 2]]  # Select multiple rows by index
print("\nRows 0 and 2 (df.iloc[[0, 2]]):")
print(multiple_rows_by_index)


# Setting an Index Column for Label-based Indexing
df.set_index('Name', inplace=True)  # Set 'Name' as the index
print("\nDataFrame with 'Name' as Index:")
print(df)


# Row Selection by Label (Index Value): Using `.loc`
alice_row = df.loc['Alice']
print("\nRow for Alice (df.loc['Alice']):", alice_row)

multiple_rows_by_label = df.loc[['Alice', 'Charlie']]  # Select multiple rows by label
print("\nRows for Alice and Charlie (df.loc[['Alice', 'Charlie']]):")
print(multiple_rows_by_label)


# Slicing: Extracting Subsets of Rows and Columns
first_two_rows = df.iloc[:2]  # Slice using integer indices (0-based, upper-bound exclusive)
print("\nFirst Two Rows (df.iloc[:2]):")
print(first_two_rows)

#Slicing rows by labels
bob_to_charlie = df.loc['Bob':'Charlie']
print("\nRows from Bob to Charlie (df.loc['Bob':'Charlie']):")
print(bob_to_charlie) # Include both 'Bob' and 'Charlie'

# Combining slicing for rows and column selection
first_two_rows_age_city = df.iloc[:2][['Age', 'City']]  # Slice rows, then select columns
print("\nFirst Two Rows, Age and City Columns:")
print(first_two_rows_age_city)



Names Column (df['Name']): 0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

Name and City Columns (df[['Name', 'City']]):
      Name           City
0    Alice       New York
1      Bob  San Francisco
2  Charlie    Los Angeles

First Row (df.iloc[0]): Name       Alice
Age           24
City    New York
Name: 0, dtype: object

Rows 0 and 2 (df.iloc[[0, 2]]):
      Name  Age         City
0    Alice   24     New York
2  Charlie   22  Los Angeles

DataFrame with 'Name' as Index:
         Age           City
Name                       
Alice     24       New York
Bob       27  San Francisco
Charlie   22    Los Angeles

Row for Alice (df.loc['Alice']): Age           24
City    New York
Name: Alice, dtype: object

Rows for Alice and Charlie (df.loc[['Alice', 'Charlie']]):
         Age         City
Name                     
Alice     24     New York
Charlie   22  Los Angeles

First Two Rows (df.iloc[:2]):
       Age           City
Name                     
Alice   24       New Y

#### Data Cleaning

In [11]:
# Dropping rows with missing values (NaN)
df_dropped = df.dropna()  
print("\nDataFrame after Dropping Rows with NaN:")
print(df_dropped)

# Filling missing values (NaN) with a specific value
df_filled = df.fillna(0)  
print("\nDataFrame after Filling NaN with 0:")
print(df_filled)

# Filling with different values for different columns
df_filled_mixed = df.fillna({'Age': 25, 'City': 'Unknown'})  
print("\nDataFrame after Filling NaN with Specific Values:")
print(df_filled_mixed)

# Forward and Backward Filling
df_ffill = df.fillna(method='ffill') # Forward fill (use previous valid value)
print("\nDataFrame after Forward Filling:")
print(df_ffill)

df_bfill = df.fillna(method='bfill') # Backward fill (use next valid value)
print("\nDataFrame after Backward Filling:")
print(df_bfill)

# Renaming columns
df_renamed = df.rename(columns={'Age': 'Years'})  
print("\nDataFrame after Renaming 'Age' to 'Years':")
print(df_renamed)



DataFrame after Dropping Rows with NaN:
         Age           City
Name                       
Alice     24       New York
Bob       27  San Francisco
Charlie   22    Los Angeles

DataFrame after Filling NaN with 0:
         Age           City
Name                       
Alice     24       New York
Bob       27  San Francisco
Charlie   22    Los Angeles

DataFrame after Filling NaN with Specific Values:
         Age           City
Name                       
Alice     24       New York
Bob       27  San Francisco
Charlie   22    Los Angeles

DataFrame after Forward Filling:
         Age           City
Name                       
Alice     24       New York
Bob       27  San Francisco
Charlie   22    Los Angeles

DataFrame after Backward Filling:
         Age           City
Name                       
Alice     24       New York
Bob       27  San Francisco
Charlie   22    Los Angeles

DataFrame after Renaming 'Age' to 'Years':
         Years           City
Name                        

  df_ffill = df.fillna(method='ffill') # Forward fill (use previous valid value)
  df_bfill = df.fillna(method='bfill') # Backward fill (use next valid value)


#### Grouping and Aggregating

In [12]:
# Grouping by 'City'
grouped = df.groupby('City')

# Aggregating: Calculating the mean age per city
mean_age_by_city = grouped['Age'].mean()
print("\nMean Age by City (grouped['Age'].mean()):")
print(mean_age_by_city)

# More Aggregations: Different statistics per city
agg_stats = grouped['Age'].agg(['mean', 'median', 'min', 'max', 'count'])  # Multiple aggregations
print("\nAggregation Statistics by City:")
print(agg_stats)

# Custom Aggregation Function: Using a lambda function to calculate the range (max - min)
range_age_by_city = grouped['Age'].agg(lambda x: x.max() - x.min())  # lambda function for custom aggregation
print("\nRange of Ages by City:")
print(range_age_by_city)

# Iterating over groups
print("\nIterating Over Groups:")
for city, group_data in grouped:
    print(f"\nCity: {city}")
    print(group_data)

# Selecting a specific group
ny_group = grouped.get_group('New York')
print("\nData for 'New York':")
print(ny_group)



Mean Age by City (grouped['Age'].mean()):
City
Los Angeles      22.0
New York         24.0
San Francisco    27.0
Name: Age, dtype: float64

Aggregation Statistics by City:
               mean  median  min  max  count
City                                        
Los Angeles    22.0    22.0   22   22      1
New York       24.0    24.0   24   24      1
San Francisco  27.0    27.0   27   27      1

Range of Ages by City:
City
Los Angeles      0
New York         0
San Francisco    0
Name: Age, dtype: int64

Iterating Over Groups:

City: Los Angeles
         Age         City
Name                     
Charlie   22  Los Angeles

City: New York
       Age      City
Name                
Alice   24  New York

City: San Francisco
      Age           City
Name                    
Bob    27  San Francisco

Data for 'New York':
       Age      City
Name                
Alice   24  New York


#### Merging & Joining

In [13]:
# Creating sample DataFrames
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=[0, 1, 2])
df2 = pd.DataFrame({'A': ['A3', 'A4', 'A5'], 'B': ['B3', 'B4', 'B5']}, index=[3, 4, 5])

# Concatenation: Combining DataFrames along an axis
concatenated_df = pd.concat([df1, df2])
print("\nConcatenated DataFrame:")
print(concatenated_df)
print(f"Shape of concatenated DataFrame: {concatenated_df.shape}")
# Concatenation by column
concat_cols_df = pd.concat([df1,df2], axis=1)
print("\nConcatenated DataFrame (By Column):")
print(concat_cols_df)

# Merging: Combining DataFrames based on common columns or indices

df3 = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'C': ['C0', 'C1', 'C2']})
df4 = pd.DataFrame({'key': ['K0', 'K2', 'K3'], 'D': ['D0', 'D2', 'D3']})

# Inner join: Only rows with matching keys are included
inner_merged_df = pd.merge(df3, df4, on='key')  
print("\nInner Join on 'key':")
print(inner_merged_df)

# Outer join: All rows are included, filling missing values with NaN
outer_merged_df = pd.merge(df3, df4, on='key', how='outer')
print("\nOuter Join on 'key':")
print(outer_merged_df)

# Left join: All rows from the left DataFrame are included
left_merged_df = pd.merge(df3, df4, on='key', how='left')
print("\nLeft Join on 'key':")
print(left_merged_df)

# Right join: All rows from the right DataFrame are included
right_merged_df = pd.merge(df3, df4, on='key', how='right')
print("\nRight Join on 'key':")
print(right_merged_df)



Concatenated DataFrame:
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
5  A5  B5
Shape of concatenated DataFrame: (6, 2)

Concatenated DataFrame (By Column):
     A    B    A    B
0   A0   B0  NaN  NaN
1   A1   B1  NaN  NaN
2   A2   B2  NaN  NaN
3  NaN  NaN   A3   B3
4  NaN  NaN   A4   B4
5  NaN  NaN   A5   B5

Inner Join on 'key':
  key   C   D
0  K0  C0  D0
1  K2  C2  D2

Outer Join on 'key':
  key    C    D
0  K0   C0   D0
1  K1   C1  NaN
2  K2   C2   D2
3  K3  NaN   D3

Left Join on 'key':
  key   C    D
0  K0  C0   D0
1  K1  C1  NaN
2  K2  C2   D2

Right Join on 'key':
  key    C   D
0  K0   C0  D0
1  K2   C2  D2
2  K3  NaN  D3
