In [1]:
import pandas as pd
import numpy as np

# Directory that holds the competition CSV files (Kaggle input mount).
DATA_DIR = '/kaggle/input/50-007-machine-learning-summer-2024'

try:
    # The training table carries both the TF-IDF features and the 'label'
    # column, so it is parsed once and the labels are taken from that frame
    # instead of reading the same CSV a second time.
    train_tfidf = pd.read_csv(f'{DATA_DIR}/train_tfidf_features.csv')
    test_data_ids = pd.read_csv(f'{DATA_DIR}/test.csv')['id']
    test_tfidf = pd.read_csv(f'{DATA_DIR}/test_tfidf_features.csv')
    train_labels = train_tfidf['label']
    print("Files loaded successfully.")
except FileNotFoundError as e:
    # Report which file is missing, then re-raise so the cell still fails.
    print(f"File not found: {e}")
    raise
except KeyError as e:
    # Raised when the expected 'id' / 'label' column is absent.
    print(f"Column not found: {e}")
    raise

# Build the model inputs: strip the bookkeeping columns ('label', 'id') from
# the feature tables; errors='ignore' makes the drop a no-op for any of those
# columns that is not present.
X = train_tfidf.drop(columns=['label', 'id'], errors='ignore')
X_test = test_tfidf.drop(columns=['id'], errors='ignore')
y = train_labels

# Quick sanity check of what was loaded: shapes plus the first 30 rows.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print("X head:", X.head(30), sep="\n")
print("y head:", y.head(30), sep="\n")

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        numpy.ndarray with the same shape as ``z`` (0-d for a scalar input)
        holding values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    # exp() is only ever applied to a non-positive argument, so it cannot
    # overflow even for very large |z|.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- the same
    # function, written so that each branch only needs exp(-|z|).
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

def loss(y, y_hat):
    """Mean binary cross-entropy between targets and predicted probabilities.

    Args:
        y: Target values.
        y_hat: Predicted probabilities; clipped to [1e-10, 1 - 1e-10] before
            the logarithms are taken so that log(0) never occurs.

    Returns:
        The negated mean of y*log(y_hat) + (1 - y)*log(1 - y_hat).
    """
    clipped = np.clip(y_hat, 1e-10, 1 - 1e-10)
    positive_term = y * np.log(clipped)
    negative_term = (1 - y) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)

def gradients(X, y, y_hat):
    """Gradients of the loss with respect to the weights and the bias.

    Args:
        X: Input batch with one row per sample.
        y: Targets for the batch.
        y_hat: Predictions for the batch.

    Returns:
        (dw, db): X.T @ (y_hat - y) and sum(y_hat - y), each scaled by
        1 / (number of rows in X).
    """
    residual = y_hat - y
    scale = 1 / X.shape[0]
    weight_grad = scale * np.dot(X.T, residual)
    bias_grad = scale * np.sum(residual)
    return weight_grad, bias_grad

def normalize(X):
    """Standardize every feature (column) of X to zero mean and unit spread.

    Each column has its mean subtracted and is divided by its standard
    deviation. Columns whose standard deviation is zero or not finite are
    divided by 1 instead, so constant columns come out as zeros rather than
    NaN/inf. A non-finite deviation occurs, for example, for a single-row
    pandas DataFrame, whose sample standard deviation is NaN.

    Note: the deviation is whatever ``X.std(axis=0)`` returns, so pandas
    inputs use the sample estimate (ddof=1) and NumPy inputs the population
    estimate (ddof=0).

    Args:
        X: pandas DataFrame or NumPy array with samples along axis 0.

    Returns:
        The standardized data, same container type and shape as ``X``.
    """
    std_dev = X.std(axis=0)
    # Build the divisor without mutating std_dev in place; this also works
    # when std() returns a scalar (1-D input) instead of an indexable object.
    usable = np.isfinite(std_dev) & (std_dev != 0)
    safe_std = np.where(usable, std_dev, 1.0)
    return (X - X.mean(axis=0)) / safe_std

def train(X, y, bs, epochs, lr):
    """Fit logistic-regression parameters with mini-batch gradient descent.

    The features are standardized with ``normalize`` first. Batches are taken
    in row order (no shuffling), and the loss over the whole training set is
    recorded after every epoch.

    Args:
        X: Feature matrix, one row per sample (pandas DataFrame or 2-D array).
        y: Targets, one per row of ``X`` (pandas Series or array-like).
            The loss is binary cross-entropy, so these are presumably 0/1
            labels -- confirm against the dataset.
        bs: Mini-batch size; must be at least 1.
        epochs: Number of passes over the data.
        lr: Learning rate (step size of each parameter update).

    Returns:
        (w, b, losses): weight array of shape (n_features, 1), the bias, and
        the list of per-epoch losses.

    Raises:
        ValueError: If ``bs`` is smaller than 1 or ``X`` and ``y`` do not have
            the same number of rows.
    """
    if bs < 1:
        raise ValueError(f"bs must be a positive integer, got {bs}")

    x = normalize(X)

    print("new x head:")
    print(x.head(30) if hasattr(x, "head") else x[:30])

    # Convert to plain ndarrays once, up front: slicing a DataFrame and
    # passing it to np.dot would otherwise repeat the conversion for every
    # mini-batch of every epoch.
    features = np.asarray(x, dtype=float)
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    n_samples, n_features = features.shape
    if targets.shape[0] != n_samples:
        # Without this check a mismatched y could be broadcast silently.
        raise ValueError(
            f"X has {n_samples} rows but y has {targets.shape[0]} values"
        )

    w = np.zeros((n_features, 1))
    b = 0
    losses = []
    # Ceiling division: the last batch may be shorter than bs.
    n_batches = (n_samples - 1) // bs + 1
    for epoch in range(epochs):
        for i in range(n_batches):
            start_i = i * bs
            end_i = start_i + bs
            xb = features[start_i:end_i]
            yb = targets[start_i:end_i]
            y_hat = sigmoid(np.dot(xb, w) + b)
            dw, db = gradients(xb, yb, y_hat)
            w -= lr * dw
            b -= lr * db
        # Track the loss on the full training set after each epoch.
        epoch_loss = loss(targets, sigmoid(np.dot(features, w) + b))
        losses.append(epoch_loss)
        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")
    return w, b, losses

def predict(X, w, b, threshold=0.5):
    """Predict class labels (0 or 1) for every row of X.

    Args:
        X: Feature matrix, one row per sample; standardized with
            ``normalize`` before scoring.
        w: Weights, as returned by ``train``.
        b: Bias, as returned by ``train``.
        threshold: A row is labelled 1 when its predicted probability is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        1-D integer numpy array with one 0/1 label per row.
    """
    # NOTE(review): normalize() uses the mean/std of the X passed in here, so
    # the scaling applied at prediction time is not the one used during
    # training -- confirm whether training statistics should be reused.
    x = normalize(X)
    preds = sigmoid(np.dot(x, w) + b)
    # Vectorized thresholding replaces the per-row Python loop.
    return (np.asarray(preds) > threshold).astype(int).reshape(-1)

# Fit the model on the full training set: batch size 100, 1000 epochs,
# learning rate 0.01. `l` receives the list of per-epoch losses.
w, b, l = train(X, y, bs=100, epochs=1000, lr=0.01)

# Label the test rows with the trained parameters.
# NOTE(review): predict() standardizes X_test with X_test's own statistics,
# not the training ones -- confirm this is intended.
y_pred = predict(X_test, w, b)
print("Predictions:", y_pred)
# Pair each test id with its predicted label and write the result as CSV.
# NOTE(review): the output column is named 'Id' while the source column is
# 'id' -- verify against the expected submission format.
predictions_df = pd.DataFrame({'Id':test_data_ids,'label': y_pred})
predictions_df.to_csv('LogRed_Prediction.csv', index=False)

Files loaded successfully.
X shape: (17184, 5000)
y shape: (17184,)
X head:
      0    1    2    3    4    5    6    7    8    9  ...  4990  4991  4992  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  .