# **Data science final project**

**Uploading libraries**

In [12]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

**Uploading the dataset**

In [13]:
data=pd.read_csv('Student_Performance.csv')
data

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


**Convert the string values into numerical values**

In [14]:
data['Extracurricular Activities'] =data['Extracurricular Activities'].astype('category')
data['Extracurricular Activities'] =data['Extracurricular Activities'].cat.codes
data

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,1,4,2,23.0
9996,7,64,1,8,5,58.0
9997,6,83,1,8,5,74.0
9998,9,97,1,7,0,95.0


**Data cleaning**

In [15]:
data.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [16]:
data=df = data.dropna()
data

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,1,4,2,23.0
9996,7,64,1,8,5,58.0
9997,6,83,1,8,5,74.0
9998,9,97,1,7,0,95.0


In [25]:
import numpy as np

# First, make sure 'data' is defined as a NumPy array
# For example:
data = np.array([[1, 2, 3, 4, 5, 6],
                 [7, 8, 9, 10, 11, 12],
                 [13, 14, 15, 16, 17, 18]])  # Example data with 6 columns

# Now create X using the first 5 columns
X = np.c_[data[:, 0:5]]  # This selects all rows and columns 0 through 4

# Display X
X

array([[ 1,  2,  3,  4,  5],
       [ 7,  8,  9, 10, 11],
       [13, 14, 15, 16, 17]])

In [27]:
y = np.c_[data[:, 5].reshape(-1, 1)]
y

array([[ 6],
       [12],
       [18]])

**Normalization**

In [28]:
X_mean=np.mean(X, axis=0)
X_std = np.std(X, axis=0)
X_normalized=(X-X_mean)/X_std


X_normalized

array([[-1.22474487, -1.22474487, -1.22474487, -1.22474487, -1.22474487],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.22474487,  1.22474487,  1.22474487,  1.22474487,  1.22474487]])

In [32]:
# First define m (number of samples) before using it
# Assuming X_normalized is already defined and is your feature matrix
m = X_normalized.shape[0]  # Get number of samples from X_normalized

# Now use m in your code
X_extend = np.c_[np.ones((m, 1)), X_normalized] 
num_features = X_extend.shape[1]
theta = np.zeros((num_features, 1))

**Cost function**

In [33]:
def costfunction(X, y, theta):
    """Calculates the mean squared error cost."""
    m = len(y)
    h = X.dot(theta)
    j = np.sum((h - y) ** 2) / (2 * m)
    return j

In [35]:
def gradientdescent(X, y, theta, alpha, num_iters):
    """Performs vectorized gradient descent to minimize the cost function."""
    m = len(y)
    J_history = np.zeros((num_iters, 1))

    for i in range(num_iters):
        # Calculate the hypothesis (h) and the error
        h = X.dot(theta)
        error = h - y

        # Vectorized update of all theta parameters simultaneously
        # X.T.dot(error) gives the gradient vector (6, 1)
        theta = theta - (alpha / m) * X.T.dot(error)

        # Store the cost for this iteration
        J_history[i] = costfunction(X, y, theta)

    return theta, J_history

In [36]:
alpha = 0.01      # Learning rate
num_iters = 1500  # Number of iterations

# Run Gradient Descent
theta_final, J_history = gradientdescent(X_extend, y, theta, alpha, num_iters)

print("--- RESULTS ---")
print(f"Final Parameters (Theta, {num_features} total):\n", theta_final)
print(f"Final Cost (J): {costfunction(X_extend, y, theta_final):.4f}")

--- RESULTS ---
Final Parameters (Theta, 6 total):
 [[11.9999966]
 [ 0.9797959]
 [ 0.9797959]
 [ 0.9797959]
 [ 0.9797959]
 [ 0.9797959]]
Final Cost (J): 0.0000


In [37]:
import numpy as np

# Define these variables before using them in the function call
# These should be calculated from your training data
X_mean = np.array([0, 0, 0, 0, 0])  # Replace with actual means of your features
X_std = np.array([1, 1, 1, 1, 1])   # Replace with actual standard deviations
theta_final = np.array([0, 0, 0, 0, 0, 0])  # Replace with your trained model parameters

def predict_performance_index(Hours, Scores, Extra, Sleep, Papers, X_mean, X_std, theta):
    
    # 1. Create an array of the 5 input features
    input_features = np.array([Hours, Scores, Extra, Sleep, Papers])
    
    # 2. Normalize the features using the 5 stored means/stds
    normalized_features = (input_features - X_mean) / X_std
    
    # 3. Add the bias term (1) at the front
    final_input_vector = np.insert(normalized_features, 0, 1)
    
    # 4. Calculate the prediction: h(x) = X * Theta
    return final_input_vector.dot(theta)

# Example Prediction: 
# (Hours=5, Scores=70, Extra=1 (Yes), Sleep=6, Papers=3)
# Note: Extra=1 corresponds to 'Yes'
predicted_index = predict_performance_index(5, 70, 1, 6, 3, X_mean, X_std, theta_final)
print(f"\nPrediction for [5, 70, 1, 6, 3]: {predicted_index:.2f}")


Prediction for [5, 70, 1, 6, 3]: 0.00
