In [2]:
import numpy as np

# array creation

In [3]:
# Build a 1D array from a plain Python sequence
a = np.array((1, 2, 3, 4))

# Nested sequences become a 2D array (matrix): 2 rows x 3 columns
b = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

# Request the element type explicitly instead of letting NumPy infer it
c = np.array([1, 2, 3], dtype="float32")

# Arrays pre-filled with a constant: 0, 1, or any value you choose
zeros = np.zeros(shape=(3, 4))
ones = np.ones(shape=(2, 3))
full = np.full(shape=(2, 2), fill_value=7)

# 4x4 identity matrix: ones on the diagonal, zeros elsewhere
identity = np.eye(N=4)


# random numbers 

In [4]:
# Seed the global generator before the first draw. A seed only affects
# numbers generated after it, so seeding at the end of the cell would leave
# the first three arrays different on every Restart & Run All.
np.random.seed(42)

# Uniform random numbers in [0, 1)
rand_uniform = np.random.rand(3, 3)

# Normal (Gaussian) distribution – used in NN weight initialization
rand_normal = np.random.randn(3, 3)

# Random integers in [0, 100)
rand_int = np.random.randint(0, 100, size=(4, 5))

# Gaussian weight matrix, drawn from the same seeded stream
weights = np.random.randn(5, 3)


# shape/size/reshaping

In [5]:
x = np.arange(12)

# Array metadata: shape tuple, number of axes, total element count
x.shape
x.ndim
x.size

# Reinterpret the 12 elements as 3 rows x 4 columns
x_reshaped = np.reshape(x, (3, 4))

# flatten() hands back a fresh 1D copy
x_flat = x_reshaped.flatten()

# ravel() also gives 1D, but returns a view when possible
x_ravel = np.ravel(x_reshaped)


# data extraction

In [6]:
data = np.array([
    [10, 20, 30],
    [40, 50, 60],
    [70, 80, 90],
])

# Single element: row 0, column 1
data[0, 1]

# The whole second row, then the whole last column
data[1]
data[:, -1]

# Top-right 2x2 sub-matrix (rows 0-1, columns 1-2)
data[:2, 1:]


array([[20, 30],
       [50, 60]])

# Boolean masking

In [7]:
scores = np.array([45, 78, 90, 62, 88])

# Comparing an array with a scalar gives one boolean per element
mask = np.greater_equal(scores, 70)

# A boolean array used as an index keeps only the True positions
passed = scores[mask]

# Same idea with the condition written inline
top_scores = scores[np.greater(scores, 80)]


# vectorized operations

In [8]:
x = np.array([1, 2, 3, 4])
y = np.array([10, 20, 30, 40])

# Arithmetic ufuncs act element by element (the +, -, *, / operators call these)
add = np.add(x, y)
sub = np.subtract(x, y)
mul = np.multiply(x, y)
div = np.true_divide(x, y)

# A scalar operand is applied to every element
scaled = 0.1 * x


# Broadcasting

In [9]:
X = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

bias = np.array([0.1, 0.2, 0.3])

# The length-3 bias is broadcast across both rows of the (2, 3) matrix,
# the same way a per-feature bias is added in a neural-network layer
Z = np.add(X, bias)


# aggregation and statistics

In [10]:
data = np.array([
    [10, 20, 30],
    [40, 50, 60],
])

# Reductions over every element of the array
np.mean(data)
np.std(data)
np.var(data)
np.sum(data)
np.min(data)
np.max(data)

# Reductions along one axis
np.mean(data, axis=0)   # collapse the rows    -> one mean per column
np.mean(data, axis=1)   # collapse the columns -> one mean per row


array([20., 50.])

# linear algebra

In [11]:
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix product, spelled two equivalent ways
C = np.matmul(A, B)
C_dot = A.dot(B)

# Transpose: rows become columns
A_T = np.transpose(A)

# Determinant
det_A = np.linalg.det(A)

# Matrix inverse
A_inv = np.linalg.inv(A)

# Eigen-decomposition: eigenvalues and their eigenvectors (one per column)
eig_vals, eig_vecs = np.linalg.eig(A)


In [12]:
v = np.array([3, 4])

# Euclidean length (the default norm for a vector)
l2 = np.linalg.norm(v)

# Sum of absolute values
l1 = np.linalg.norm(v, 1)

# Euclidean distance = L2 norm of the difference vector
u = np.array([1, 2])
distance = np.linalg.norm(np.subtract(v, u))


# sorting and ranking

In [13]:
arr = np.array([40, 10, 30, 20])

# Ascending copy; the original array is left untouched
sorted_arr = np.sort(arr)

# Positions in arr that would put it in ascending order
sort_idx = np.argsort(arr)

# Two largest values, in ascending order (last two entries of the sort order)
top2 = arr[np.argsort(arr)[-2:]]


# count & set operations

In [14]:
labels = np.array((1, 2, 2, 3, 3, 3))

# Distinct values, returned in sorted order
unique_vals = np.unique(labels)

# Distinct values together with how many times each one occurs
vals, counts = np.unique(
    labels,
    return_counts=True,
)


# missing values 

In [None]:
data = np.array([1.0, 2.0, float("nan"), 4.0])

# Boolean array marking the NaN positions
np.isnan(data)

# Keep only the entries that are not NaN
clean_data = data[np.logical_not(np.isnan(data))]

# Substitute 0.0 for every NaN (returns a new array)
filled = np.nan_to_num(data, nan=0.0)

# Mean computed over the non-NaN entries only
np.nanmean(data)


# stacking and concatenations

In [15]:
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# B placed below A -> shape (4, 2)
vstack = np.vstack([A, B])

# B placed to the right of A -> shape (2, 4)
hstack = np.hstack([A, B])

# General form: join along the chosen axis (axis=0 joins the rows)
concat = np.concatenate([A, B], axis=0)


# feature scaling

In [16]:
X = np.array([
    [1, 200],
    [2, 300],
    [3, 400],
])

# Min-max scaling: map each column (feature) onto [0, 1]
X_min = np.min(X, axis=0)
X_max = np.max(X, axis=0)
X_scaled = np.true_divide(X - X_min, X_max - X_min)

# Standardization (z-score): each column gets mean 0 and standard deviation 1
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X_standardized = np.true_divide(X - mean, std)


# activation functions 

In [17]:
z = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])

# NOTE: `sigmoid`, `relu` and `softmax` below are result arrays. Later cells
# in this notebook define functions called `sigmoid` and `softmax`, which
# rebind those two names.

# Sigmoid: squashes every value into (0, 1)
sigmoid = 1.0 / (1.0 + np.exp(-z))

# ReLU: negatives become 0, everything else passes through
relu = np.maximum(0, z)

# Softmax (output layer): subtract the max before exponentiating so np.exp
# never sees a large positive argument, then normalise to sum to 1
exp_z = np.exp(z - z.max())
softmax = exp_z / exp_z.sum()


# loss functions

In [18]:
# --- Mean Squared Error: average of the squared residuals ---
y_true = np.array([3.0, 2.5, 4.0])
y_pred = np.array([2.5, 2.0, 4.5])

mse = np.square(y_true - y_pred).mean()

# --- Cross-entropy: one-hot target against predicted class probabilities ---
# (y_true / y_pred are deliberately rebound for the second example)
y_true = np.array([1, 0, 0])
y_pred = np.array([0.7, 0.2, 0.1])

# Only the true class contributes because the other targets are 0
cross_entropy = -(y_true * np.log(y_pred)).sum()


# np.where

In [29]:
x = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])

# np.where(condition, value_if_true, value_if_false), applied element-wise:
# keep x where it is positive, otherwise use 0 (i.e. ReLU)
relu = np.where(np.greater(x, 0), x, 0)
relu

array([0., 0., 0., 1., 2.])

In [30]:
scores = np.array([45, 78, 90, 62, 88])

# Turn a numeric threshold into a categorical label, element by element
labels = np.where(np.greater_equal(scores, 70), "Pass", "Fail")
labels

array(['Fail', 'Pass', 'Pass', 'Fail', 'Pass'], dtype='<U4')

In [32]:
data = np.array([10, -5, 20, -3, 0])

# Keep the non-negative entries, put 0 wherever the value is negative
cleaned = np.where(data >= 0, data, 0)
cleaned

array([10,  0, 20,  0,  0])

# views/actual copy 

In [34]:
import numpy as np

arr = np.array([1, 2, 3, 4, 5])

# Basic slicing returns a view: it shares memory with arr
slice_view = arr[1:4]

# Writing through the view therefore changes arr as well
slice_view.fill(100)

print(slice_view)
print(arr)

[100 100 100]
[  1 100 100 100   5]


In [36]:
arr = np.array([1, 2, 3, 4, 5])

# An explicit copy owns its own memory...
slice_copy = np.copy(arr[1:4])

# ...so overwriting it leaves arr unchanged
slice_copy.fill(200)
print(arr)

print(slice_copy)

[1 2 3 4 5]
[200 200 200]


# clip

In [37]:
import numpy as np

y_pred = np.array([0.0, 0.2, 0.8, 1.0])

# Keep probabilities strictly inside (0, 1) so that neither log(p) nor
# log(1 - p) is ever evaluated at 0
y_pred_safe = y_pred.clip(1e-8, 1 - 1e-8)


In [38]:
features = np.array([10, 20, 300, 25, 30])

# Limit every value to the range [0, 100]; the outlier 300 becomes 100
features_clipped = features.clip(0, 100)


In [39]:
gradients = np.array([5.0, -12.0, 3.0, -8.0])

# Cap each gradient component at magnitude 5 to guard against exploding gradients
gradients_clipped = gradients.clip(-5.0, 5.0)


# Neural Network : Numpy 

In [25]:
import numpy as np

# Fix the global seed so the weight initialization below is repeatable
np.random.seed(42)

# Toy dataset: 6 samples x 2 features, two well-separated clusters
X = np.array([
    [0.5, 1.5],
    [1.0, 1.0],
    [1.5, 0.5],
    [3.0, 3.5],
    [3.5, 3.0],
    [4.0, 4.0],
])

# Binary labels as a column vector of shape (6, 1), one row per sample
y = np.array([0, 0, 0, 1, 1, 1]).reshape(-1, 1)

# Number of samples
m = len(X)


def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise to ``z``."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator


def sigmoid_derivative(a):
    """Derivative of the sigmoid, expressed through its output.

    ``a`` must already be ``sigmoid(z)`` (the activation), not the raw
    pre-activation ``z``.
    """
    complement = 1 - a
    return a * complement

def binary_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y_true : array_like
        Ground-truth labels (0 or 1), broadcastable against ``y_pred``.
    y_pred : array_like
        Predicted probabilities.

    Returns
    -------
    float
        ``-mean(y * log(p) + (1 - y) * log(1 - p))``, always finite and >= 0
        for labels in [0, 1].
    """
    epsilon = 1e-8  # to avoid log(0)
    # Clip into [epsilon, 1 - epsilon] rather than adding epsilon inside the
    # log: adding it shifts every probability and makes log(1 + epsilon) > 0,
    # which produced a slightly negative loss for a perfect prediction.
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(
        y_true * np.log(y_pred) +
        (1 - y_true) * np.log(1 - y_pred)
    )
class NeuralNetwork:
    """Two-layer fully connected network with sigmoid activations.

    Architecture: input -> hidden (sigmoid) -> output (sigmoid). Training is
    full-batch gradient descent; the loss reported by ``train`` is binary
    cross-entropy.

    Parameters
    ----------
    input_size : int
        Number of input features (columns of ``X``).
    hidden_size : int
        Number of hidden units.
    output_size : int
        Number of output units.
    learning_rate : float, default 0.1
        Step size of the gradient-descent update performed in ``backward``.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.1):
        # Weight initialization (Gaussian, scaled by 0.01); biases start at zero
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))

        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

        self.learning_rate = learning_rate

    @staticmethod
    def _as_column(y):
        """Return ``y`` as an array, turning a 1-D label vector into a column.

        A label vector of shape (m,) would broadcast against the (m, 1)
        network output into an (m, m) matrix, so it is reshaped to (m, 1).
        Arrays with two or more dimensions are returned unchanged.
        """
        y = np.asarray(y)
        return y.reshape(-1, 1) if y.ndim == 1 else y

    def forward(self, X):
        """Run a forward pass and return the output activations.

        The intermediate values ``Z1``, ``A1``, ``Z2`` and ``A2`` are stored
        on the instance because ``backward`` reads ``A1`` and ``A2``.
        """
        # Layer 1
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = sigmoid(self.Z1)

        # Output layer
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = sigmoid(self.Z2)

        return self.A2

    def backward(self, X, y):
        """Back-propagate the error and apply one gradient-descent step.

        Must be called after ``forward(X)`` on the same ``X``, since it uses
        the cached activations ``A1`` and ``A2``.

        1. Compute the gradient of the loss w.r.t. the weights using the
           chain rule,
        2. propagate errors backward layer by layer,
        3. and update parameters using gradient descent.
        """
        y = self._as_column(y)
        m = X.shape[0]

        # Output layer gradients. For a sigmoid output with cross-entropy
        # loss, dL/dZ2 reduces to (prediction - label).
        dZ2 = self.A2 - y
        dW2 = (1 / m) * (self.A1.T @ dZ2)
        db2 = (1 / m) * np.sum(dZ2, axis=0, keepdims=True)

        # Hidden layer gradients (computed with W2 before it is updated)
        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * sigmoid_derivative(self.A1)
        dW1 = (1 / m) * (X.T @ dZ1)
        db1 = (1 / m) * np.sum(dZ1, axis=0, keepdims=True)

        # Gradient descent update
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1

    def train(self, X, y, epochs=1000):
        """Train for ``epochs`` full-batch iterations.

        Prints the loss every 100 epochs. The reported loss is the one
        measured before that epoch's parameter update.
        """
        y = self._as_column(y)

        for epoch in range(epochs):
            y_pred = self.forward(X)
            loss = binary_cross_entropy(y, y_pred)
            self.backward(X, y)

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 predictions: 1 where the output is >= ``threshold``."""
        probs = self.forward(X)
        return (probs >= threshold).astype(int)


In [26]:
# Network configuration: 2 input features -> 4 hidden units -> 1 output
hyperparams = dict(
    input_size=2,
    hidden_size=4,
    output_size=1,
    learning_rate=0.5,
)
nn = NeuralNetwork(**hyperparams)

# Fit on the toy dataset; the loss is printed every 100 epochs
nn.train(X, y, epochs=1000)


Epoch 0, Loss: 0.6932
Epoch 100, Loss: 0.2996
Epoch 200, Loss: 0.0518
Epoch 300, Loss: 0.0224
Epoch 400, Loss: 0.0135
Epoch 500, Loss: 0.0095
Epoch 600, Loss: 0.0072
Epoch 700, Loss: 0.0058
Epoch 800, Loss: 0.0048
Epoch 900, Loss: 0.0041


In [27]:
# Hard 0/1 predictions on the training inputs, shown next to the true labels
predictions = nn.predict(X, threshold=0.5)
print("Predictions:\n", predictions, sep=" ")
print("Actual:\n", y, sep=" ")

Predictions:
 [[0]
 [0]
 [0]
 [1]
 [1]
 [1]]
Actual:
 [[0]
 [0]
 [0]
 [1]
 [1]
 [1]]


# Transformer encoder, single-head self-attention block only (no residual connection, LayerNorm or feed-forward layer) : NumPy

In [40]:
import numpy as np

# ===============================
# Reproducibility
# ===============================
np.random.seed(42)

# ===============================
# 1. Input sentence as token IDs
# ===============================
# The sentence is assumed to be tokenized already
tokens = np.array([4, 12, 7, 9, 3, 10])  # 6 tokens

seq_len = tokens.shape[0]
vocab_size = 50
d_model = 8  # width of every embedding vector

# ===============================
# 2. Token embeddings
# ===============================
# One d_model-sized row per vocabulary entry (random stand-in for learned values)
token_embedding_matrix = np.random.randn(vocab_size, d_model)

# Pick the row belonging to each token ID -> (seq_len, d_model)
token_embeddings = np.take(token_embedding_matrix, tokens, axis=0)

# ===============================
# 3. Positional embeddings
# ===============================
# One row per position in the sequence (random stand-in for learned values)
positional_embedding_matrix = np.random.randn(seq_len, d_model)

# Every position 0..seq_len-1 is used, so the full matrix is taken as is
positional_embeddings = positional_embedding_matrix

# ===============================
# 4. Encoder input
# ===============================
# Element-wise sum of token and position information -> (seq_len, d_model)
X = token_embeddings + positional_embeddings

# ===============================
# 5. Projection weights for Q, K, V
# ===============================
# Drawn in the order Q, K, V so the random stream is consumed as before
W_Q, W_K, W_V = (np.random.randn(d_model, d_model) for _ in range(3))

# ===============================
# 6. Queries, keys, values
# ===============================
# Each is (seq_len, d_model)
Q = np.matmul(X, W_Q)
K = np.matmul(X, W_K)
V = np.matmul(X, W_V)

# ===============================
# 7. Scaled dot-product attention scores
# ===============================
# Similarity of every query with every key -> (seq_len, seq_len)
scores = np.matmul(Q, K.T)

# Divide by sqrt(d_model) (single head, so the key width equals d_model)
scaled_scores = scores / np.sqrt(d_model)

# -------------------------------
# 8. Softmax (Attention Weights)
# -------------------------------
def softmax(x):
    """Softmax along the last axis.

    The maximum along the last axis is subtracted before exponentiating, so
    the largest exponent is exp(0) = 1 and large inputs do not overflow.
    Each slice along the last axis of the result sums to 1.
    """
    row_max = np.max(x, axis=-1, keepdims=True)
    unnormalised = np.exp(x - row_max)
    return unnormalised / np.sum(unnormalised, axis=-1, keepdims=True)

# Row i holds how strongly token i attends to every token -> (seq_len, seq_len)
attention_weights = softmax(scaled_scores)

# ===============================
# 9. Weighted sum of values
# ===============================
# Each output row is a mix of the value vectors, weighted by the attention row
attention_output = np.matmul(attention_weights, V)
# Shape: (seq_len, d_model)

# ===============================
# 10. Output projection
# ===============================
W_O = np.random.randn(d_model, d_model)

encoder_output = np.matmul(attention_output, W_O)
# Shape: (seq_len, d_model)

# ===============================
# Final output of the attention block
# ===============================
encoder_output


array([[ -1.34903236,   1.59306367,   5.97405543, -10.07031512,
          2.02015256,   0.08518497, -23.75934035, -10.59915599],
       [ -1.66032544,  -2.04324826,  -1.20655615, -14.04199566,
          1.9208312 ,   4.09177349, -27.47093388,  -5.75217534],
       [ -2.7502641 ,   7.09734995,  -9.5584203 ,  -1.2801011 ,
         18.56648003, -12.37400589,  21.17231967,  29.18579374],
       [ -1.98435411,   1.3733636 ,  -7.7065881 ,  24.94931337,
        -31.03617656,  15.301169  ,  30.50316471, -12.4812306 ],
       [  3.08908097,   2.9440524 ,   8.55491482,  19.86921318,
        -13.11686279,  -1.36116844,  25.60197251,  -9.20708819],
       [ -1.9774402 ,   1.32215887,  -7.5889322 ,  24.28450688,
        -30.44183046,  15.08031294,  29.53045714, -12.34457851]])