# WE WILL CREATE OUR OWN DATA SET BY USING PANDAS AND NUMPY

In [20]:
import pandas as pd
import numpy as np

In [21]:
# Fix NumPy's global RNG state so the generated dataset is identical on every run.
np.random.seed(seed=42)


In [22]:
# Build 100 synthetic student records as a dict of equal-length NumPy arrays.
rows = 100

# CGPA is drawn first (continuous, rounded to two decimals) so the RNG call
# order matches the column order of the resulting dict.
data = {"CGPA": np.round(np.random.uniform(5.5, 9.5, rows), 2)}

# Integer-valued columns as (name, low, high); `high` is exclusive, as in
# np.random.randint. The tuple order fixes both column order and draw order.
integer_column_bounds = (
    ("Backlogs", 0, 4),
    ("Internships", 0, 3),
    ("Projects", 1, 5),
    ("ExtraCurricular", 0, 2),
    ("ProgrammingSkill", 3, 11),
    ("MockTestScore", 40, 100),
    ("CommunicationSkill", 4, 11),
)
data.update(
    {
        name: np.random.randint(lower, upper, rows)
        for name, lower, upper in integer_column_bounds
    }
)

In [23]:
# Assemble the frame, then derive the binary label from three threshold rules:
# a student counts as placed only when every rule holds.
df = pd.DataFrame(data)
cgpa_ok = df["CGPA"] > 7.0
mock_test_ok = df["MockTestScore"] > 70
programming_ok = df["ProgrammingSkill"] > 6
df["Placed"] = (cgpa_ok & mock_test_ok & programming_ok).astype(int)


In [24]:
# Persist the generated frame without the index column, then confirm on stdout.
df.to_csv(path_or_buf="placement_data.csv", index=False)
print("✅ placement_data.csv created successfully.")


✅ placement_data.csv created successfully.


In [25]:
# Preview the first five generated rows.
df.head(n=5)

Unnamed: 0,CGPA,Backlogs,Internships,Projects,ExtraCurricular,ProgrammingSkill,MockTestScore,CommunicationSkill,Placed
0,7.0,2,0,2,1,7,49,7,0
1,9.3,3,0,1,0,8,65,4,0
2,8.43,2,2,3,1,7,73,5,1
3,7.89,0,1,2,1,9,90,5,1
4,6.12,3,1,4,1,5,80,7,0


In [26]:
# Preview the last five generated rows.
df.tail(n=5)

Unnamed: 0,CGPA,Backlogs,Internships,Projects,ExtraCurricular,ProgrammingSkill,MockTestScore,CommunicationSkill,Placed
95,7.48,3,2,2,1,3,68,9,0
96,7.59,1,1,3,0,4,82,10,0
97,7.21,2,1,1,0,10,50,10,0
98,5.6,0,1,4,1,7,57,4,0
99,5.93,3,1,1,0,4,86,8,0


In [27]:
# Count missing values per column.
df.isna().sum()

CGPA                  0
Backlogs              0
Internships           0
Projects              0
ExtraCurricular       0
ProgrammingSkill      0
MockTestScore         0
CommunicationSkill    0
Placed                0
dtype: int64

In [28]:
# Print the frame's structure: index range, column dtypes and non-null counts.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CGPA                100 non-null    float64
 1   Backlogs            100 non-null    int32  
 2   Internships         100 non-null    int32  
 3   Projects            100 non-null    int32  
 4   ExtraCurricular     100 non-null    int32  
 5   ProgrammingSkill    100 non-null    int32  
 6   MockTestScore       100 non-null    int32  
 7   CommunicationSkill  100 non-null    int32  
 8   Placed              100 non-null    int64  
dtypes: float64(1), int32(7), int64(1)
memory usage: 4.4 KB


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,CGPA,Backlogs,Internships,Projects,ExtraCurricular,ProgrammingSkill,MockTestScore,CommunicationSkill,Placed
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,7.3805,1.49,1.04,2.45,0.56,6.4,71.29,7.03,0.2
std,1.190258,1.167705,0.8278,1.14922,0.498888,2.215647,16.05187,1.987105,0.402015
min,5.52,0.0,0.0,1.0,0.0,3.0,43.0,4.0,0.0
25%,6.27,0.0,0.0,1.0,0.0,4.0,58.0,5.0,0.0
50%,7.355,1.5,1.0,2.0,1.0,6.0,72.0,7.0,0.0
75%,8.4225,3.0,2.0,3.25,1.0,8.0,86.25,9.0,0.0
max,9.45,3.0,2.0,4.0,1.0,10.0,98.0,10.0,1.0


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
# Read the dataset back from the CSV written earlier in the notebook.
df = pd.read_csv(filepath_or_buffer="placement_data.csv")


In [32]:
# Preview the first five rows of the frame reloaded from the CSV.
df.head(n=5)

Unnamed: 0,CGPA,Backlogs,Internships,Projects,ExtraCurricular,ProgrammingSkill,MockTestScore,CommunicationSkill,Placed
0,7.0,2,0,2,1,7,49,7,0
1,9.3,3,0,1,0,8,65,4,0
2,8.43,2,2,3,1,7,73,5,1
3,7.89,0,1,2,1,9,90,5,1
4,6.12,3,1,4,1,5,80,7,0


In [33]:
# Separate the predictor columns from the label column.
X = df.drop(columns=["Placed"])
y = df["Placed"]  # 0/1 label; the regression below models it as a continuous score

In [34]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [35]:
# Fit an ordinary least-squares regressor on the training split.
# NOTE(review): the target is a 0/1 label, so predictions are unbounded scores
# rather than calibrated probabilities; a classifier may be a better fit.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [36]:

# Score the held-out rows with the fitted model.
y_pred = model.predict(X=X_test)


In [37]:
# Report each regression metric for the held-out predictions, one per line.
for metric_label, metric_fn in (
    ("R² Score:", r2_score),
    ("MSE:", mean_squared_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

R² Score: -1.1956048751640642
MSE: 0.10429123157029303


In [38]:
import joblib

In [40]:

# Persist the feature column names, in training order, alongside the model.
joblib.dump(X.columns.tolist(), "placement_features.pkl")

['placement_features.pkl']

In [41]:
# Serialize the fitted estimator to disk.
joblib.dump(value=model, filename="placement_predictor_regression.pkl")


['placement_predictor_regression.pkl']

In [42]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


In [43]:
# Regression metrics comparing the held-out labels (y_test) with the model's
# predictions (y_pred).

# Coefficient of determination
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Mean absolute error
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Mean squared error, and its square root (same units as the target)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [44]:
# Emit the four metrics as a single newline-separated report.
metric_report = [
    f"R² Score: {r2}",
    f"MSE (Mean Squared Error): {mse}",
    f"RMSE (Root Mean Squared Error): {rmse}",
    f"MAE (Mean Absolute Error): {mae}",
]
print("\n".join(metric_report))

R² Score: -1.1956048751640642
MSE (Mean Squared Error): 0.10429123157029303
RMSE (Root Mean Squared Error): 0.3229415296463015
MAE (Mean Absolute Error): 0.25729489630467495


# 📉 Model Evaluation 
Hey there! So after training and testing the model, here’s how well it performed:

R² Score: -1.20
This is not good. A negative R² means the model is doing worse than if we just predicted the average for every value. In other words, a constant guess would have been more accurate than the model's predictions.

MSE (Mean Squared Error): 0.104
This shows how far off our predictions are, on average (squared). Smaller is better, but this alone doesn’t tell the full story.

RMSE (Root Mean Squared Error): 0.32
The typical prediction error is about 0.32 units. Since the target only takes the values 0 and 1, that is a large error. Again, not ideal.

MAE (Mean Absolute Error): 0.26
This means the model predictions are off by about 0.26 units on average, which isn’t horrible, but still not great.

In [None]:
# What This Tells Us
Basically, our model isn't capturing the pattern in the data well. It’s not able to make accurate predictions, and we need to improve it if we want it to be useful.



 # ✅ How We Can Improve It:

Add Better Features
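Your current inputs (columns) may not be enough on their own. CGPA, internships, and number of projects are already in the dataset, so try engineered features instead — for example, indicators such as CGPA > 7.0 or MockTestScore > 70, which mirror the thresholds used to generate the Placed label.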

Try Stronger Models
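Instead of Linear Regression, try using models like RandomForestRegressor or XGBRegressor. They can capture non-linear patterns and often work better with real-world data. Because Placed is a 0/1 label, classifiers such as LogisticRegression or RandomForestClassifier are also worth trying.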

Scale the Data
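If your input values are on very different scales (e.g., CGPA vs MockTestScore), use StandardScaler() to balance them. Note that this matters most for regularized or distance-based models; plain linear regression predictions do not change when features are rescaled.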

Visualize Predictions
Plot a graph of actual vs predicted values. That will help you see how far off the model is and where it’s struggling.

Clean the Data
Make sure there are no missing values, wrong entries, or outliers (weird values that don’t belong).

