In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset; the 'Hours' and 'Score' columns are used below.
data = pd.read_csv("/content/Book1.csv")
# A bare expression in the middle of a cell is discarded, so print the
# preview explicitly.
print(data.head(10))

# Double brackets keep the feature as a one-column DataFrame (2-D input).
x = data[['Hours']]
y = data['Score']

model = LinearRegression()
model.fit(x, y)

# NOTE: predictions are made on the same rows the model was fitted on, so
# the metrics below are in-sample errors.
y_pred = model.predict(x)

mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = np.sqrt(mse)

print("MAE", mae)
print("MSE", mse)
print("RMSE", rmse)

# Scatter of the observations with the fitted line on top.
plt.scatter(x, y)
plt.plot(x, y_pred, color='red')
plt.xlabel("Hours")
plt.ylabel("Score")
plt.title("Score vs Hours with fitted regression line")
plt.show()

# The model was fitted on a DataFrame with a named column, so predict with
# the same column name instead of a bare nested list; this avoids the
# scikit-learn "X does not have valid feature names" warning.
new_hours = pd.DataFrame({'Hours': [7]})
new_prediction = model.predict(new_hours)
print("New Prediction", new_prediction)





In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Read the student performance table
df = pd.read_csv("StudentsPerformance.csv")

# -------------------
# 1. Visualization
# -------------------
# One box plot per subject, each grouped by the "lunch" column.
for score_col, plot_title in (
    ("math score", "Math Scores by Lunch Type"),
    ("reading score", "Reading Scores by Lunch Type"),
    ("writing score", "Writing Scores by Lunch Type"),
):
    plt.figure(figsize=(12, 5))
    sns.boxplot(x="lunch", y=score_col, data=df)
    plt.title(plot_title)
    plt.show()

# -------------------
# 2. Model Training
# -------------------
# The three score columns are the targets; every other column is a feature.
target_cols = ["math score", "reading score", "writing score"]
X = df.drop(columns=target_cols)
y = df[target_cols]

# One-hot encode the object-dtype feature columns; any column not listed
# is dropped by the transformer (remainder="drop").
categorical_cols = X.select_dtypes(include="object").columns
encoder = OneHotEncoder(drop="first")
preprocessor = ColumnTransformer(
    transformers=[("cat", encoder, categorical_cols)],
    remainder="drop",
)

model = RandomForestRegressor(n_estimators=200, random_state=42)
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model),
])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

pipeline.fit(X_train, y_train)

# -------------------
# 3. Simulation
# -------------------
# Baseline: predictions for the untouched test rows
y_pred = pipeline.predict(X_test)

# Counterfactual: same rows, but with every "lunch" value set to "standard"
X_sim = X_test.copy()
X_sim["lunch"] = "standard"
y_sim = pipeline.predict(X_sim)

# -------------------
# 4. Compare Results
# -------------------
# Each entry: (target column, observed key, predicted key, simulated key).
# The position of an entry is also its column index in y_pred / y_sim.
subject_columns = (
    ("math score", "Original_math", "Pred_math", "Sim_math_if_standard"),
    ("reading score", "Original_reading", "Pred_reading", "Sim_reading_if_standard"),
    ("writing score", "Original_writing", "Pred_writing", "Sim_writing_if_standard"),
)

comparison_data = {}
for col_idx, (target_col, original_key, pred_key, sim_key) in enumerate(subject_columns):
    comparison_data[original_key] = y_test[target_col].values
    comparison_data[pred_key] = y_pred[:, col_idx]
    comparison_data[sim_key] = y_sim[:, col_idx]

comparison = pd.DataFrame(comparison_data)

print(comparison.head(20))

# -------------------
# 5. Visualization of improvement
# -------------------
# Red: observed math scores of all test students (not only free/reduced).
# Blue: model predictions with lunch forced to "standard".
# NOTE: this overlays observed values against predictions, so differences in
# spread reflect model error as well as the simulated lunch change.
plt.figure(figsize=(10, 6))
sns.histplot(comparison["Original_math"], color="red", label="Original Math", kde=True)
sns.histplot(comparison["Sim_math_if_standard"], color="blue", label="If Standard Lunch", kde=True)
# Label the axis explicitly so it describes both overlaid series.
plt.xlabel("Math score")
plt.legend()
# The title now describes what is plotted; the earlier "Free/Reduced" wording
# did not match the data, which covers every test row.
plt.title("Math Scores Distribution: Original vs If Lunch=Standard")
plt.show()


In [None]:
# ----------------------------
# STEP 1: Import Libraries
# ----------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ----------------------------
# STEP 2: Load Dataset
# ----------------------------
df = pd.read_csv("StudentsPerformance.csv")
print(df.head())

# ----------------------------
# STEP 3: Exploratory Data Analysis (EDA)
# ----------------------------
# One box plot per subject, each grouped by the "lunch" column.
for score_col, plot_title in (
    ("math score", "Math Scores by Lunch Type"),
    ("reading score", "Reading Scores by Lunch Type"),
    ("writing score", "Writing Scores by Lunch Type"),
):
    plt.figure(figsize=(12, 5))
    sns.boxplot(x="lunch", y=score_col, data=df)
    plt.title(plot_title)
    plt.show()

# ----------------------------
# STEP 4: Split Features (X) and Targets (y)
# ----------------------------
# The three score columns are the targets; every other column is a feature.
target_cols = ["math score", "reading score", "writing score"]
X = df.drop(columns=target_cols)
y = df[target_cols]

# ----------------------------
# STEP 5: Preprocessing
# ----------------------------
# One-hot encode the object-dtype feature columns; any column not listed
# is dropped by the transformer (remainder="drop").
categorical_cols = X.select_dtypes(include="object").columns

encoder = OneHotEncoder(drop="first")
preprocessor = ColumnTransformer(
    transformers=[("cat", encoder, categorical_cols)],
    remainder="drop",
)

# ----------------------------
# STEP 6: Define Model (Supervised Learning)
# ----------------------------
model = RandomForestRegressor(n_estimators=200, random_state=42)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model),
])

# ----------------------------
# STEP 7: Train-Test Split
# ----------------------------
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ----------------------------
# STEP 8: Train Model
# ----------------------------
pipeline.fit(X_train, y_train)

# ----------------------------
# STEP 9: Evaluate Model
# ----------------------------
# Predict all three scores for the held-out rows.
y_pred = pipeline.predict(X_test)

# Both metrics are given all three target columns at once; with their
# default settings scikit-learn reports a single value averaged uniformly
# over the outputs.
# The printed labels were mis-encoded (mojibake); restored to the intended
# characters.
print("\n📊 Model Evaluation:")
print("MAE (Mean Absolute Error):", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# ----------------------------
# STEP 10: Simulate Lunch Effect
# ----------------------------
# Counterfactual input: the test rows with every "lunch" value set to
# "standard". Working on a copy leaves X_test untouched.
X_sim = X_test.copy()
X_sim["lunch"] = "standard"

y_sim = pipeline.predict(X_sim)

# ----------------------------
# STEP 11: Compare Predictions
# ----------------------------
# Each entry: (target column, observed key, predicted key, simulated key).
# The position of an entry is also its column index in y_pred / y_sim.
subject_columns = (
    ("math score", "Original_math", "Pred_math", "Sim_math_if_standard"),
    ("reading score", "Original_reading", "Pred_reading", "Sim_reading_if_standard"),
    ("writing score", "Original_writing", "Pred_writing", "Sim_writing_if_standard"),
)

comparison_data = {}
for col_idx, (target_col, original_key, pred_key, sim_key) in enumerate(subject_columns):
    comparison_data[original_key] = y_test[target_col].values
    comparison_data[pred_key] = y_pred[:, col_idx]
    comparison_data[sim_key] = y_sim[:, col_idx]

comparison = pd.DataFrame(comparison_data)

# The header text was mis-encoded (mojibake); restored to the intended emoji.
print("\n🔍 First 10 rows of comparison:")
print(comparison.head(10))

# ----------------------------
# STEP 12: Visualize Improvement
# ----------------------------
# One figure per subject instead of three copy-pasted blocks.
# Red: observed scores of the test rows. Blue: model predictions with lunch
# forced to "standard".
# NOTE: this overlays observed values against predictions, so differences in
# spread reflect model error as well as the simulated lunch change.
for subject in ("math", "reading", "writing"):
    subject_label = subject.capitalize()

    plt.figure(figsize=(10, 6))
    sns.histplot(
        comparison[f"Original_{subject}"],
        color="red", label=f"Original {subject_label}", kde=True,
    )
    sns.histplot(
        comparison[f"Sim_{subject}_if_standard"],
        color="blue", label="If Standard Lunch", kde=True,
    )
    # Label the axis explicitly so it describes both overlaid series.
    plt.xlabel(f"{subject_label} score")
    plt.legend()
    plt.title(f"{subject_label} Scores Distribution: Original vs If Lunch=Standard")
    plt.show()

# ----------------------------
# STEP 13: Average Score Improvement
# ----------------------------
# Mean change in the predicted score when lunch is forced to "standard",
# taken over every row of the comparison table (simulated prediction minus
# baseline prediction).
# The header text was mis-encoded (mojibake); restored to the intended
# characters.
print("\n📈 Average Improvement if Lunch → Standard:")
for subject in ("math", "reading", "writing"):
    mean_change = (
        comparison[f"Sim_{subject}_if_standard"] - comparison[f"Pred_{subject}"]
    ).mean()
    # Signed formatting: a hard-coded "+" would read "+ -0.3" for a decrease.
    print(f"{subject.capitalize():<8}{mean_change:+.3f}")


In [None]:
# ----------------------------
# STEP 1: Import Libraries
# ----------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ----------------------------
# STEP 2: Load Dataset
# ----------------------------
df = pd.read_csv("StudentsPerformance.csv")
print(df.head())

# ----------------------------
# STEP 3: Exploratory Data Analysis (EDA)
# ----------------------------
# One box plot per subject, each grouped by the "lunch" column.
for score_col, plot_title in (
    ("math score", "Math Scores by Lunch Type"),
    ("reading score", "Reading Scores by Lunch Type"),
    ("writing score", "Writing Scores by Lunch Type"),
):
    plt.figure(figsize=(12, 5))
    sns.boxplot(x="lunch", y=score_col, data=df)
    plt.title(plot_title)
    plt.show()

# ----------------------------
# STEP 4: Split Features (X) and Targets (y)
# ----------------------------
# The three score columns are the targets; every other column is a feature.
target_cols = ["math score", "reading score", "writing score"]
X = df.drop(columns=target_cols)
y = df[target_cols]

# ----------------------------
# STEP 5: Preprocessing
# ----------------------------
# One-hot encode the object-dtype feature columns; any column not listed
# is dropped by the transformer (remainder="drop").
categorical_cols = X.select_dtypes(include="object").columns

encoder = OneHotEncoder(drop="first")
preprocessor = ColumnTransformer(
    transformers=[("cat", encoder, categorical_cols)],
    remainder="drop",
)

# ----------------------------
# STEP 6: Define Model (Supervised Learning)
# ----------------------------
model = RandomForestRegressor(n_estimators=200, random_state=42)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model),
])

# ----------------------------
# STEP 7: Train-Test Split
# ----------------------------
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ----------------------------
# STEP 8: Train Model
# ----------------------------
pipeline.fit(X_train, y_train)

# ----------------------------
# STEP 9: Evaluate Model
# ----------------------------
# Predict all three scores for the held-out rows.
y_pred = pipeline.predict(X_test)

# Both metrics are given all three target columns at once; with their
# default settings scikit-learn reports a single value averaged uniformly
# over the outputs.
# The printed labels were mis-encoded (mojibake); restored to the intended
# characters.
print("\n📊 Model Evaluation:")
print("MAE (Mean Absolute Error):", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# ----------------------------
# STEP 10: Simulate Lunch Effect
# ----------------------------
# Counterfactual input: the test rows with every "lunch" value set to
# "standard". Working on a copy leaves X_test untouched.
X_sim = X_test.copy()
X_sim["lunch"] = "standard"

y_sim = pipeline.predict(X_sim)

# ----------------------------
# STEP 11: Compare Predictions
# ----------------------------
# Each entry: (target column, observed key, predicted key, simulated key).
# The position of an entry is also its column index in y_pred / y_sim.
subject_columns = (
    ("math score", "Original_math", "Pred_math", "Sim_math_if_standard"),
    ("reading score", "Original_reading", "Pred_reading", "Sim_reading_if_standard"),
    ("writing score", "Original_writing", "Pred_writing", "Sim_writing_if_standard"),
)

comparison_data = {}
for col_idx, (target_col, original_key, pred_key, sim_key) in enumerate(subject_columns):
    comparison_data[original_key] = y_test[target_col].values
    comparison_data[pred_key] = y_pred[:, col_idx]
    comparison_data[sim_key] = y_sim[:, col_idx]

comparison = pd.DataFrame(comparison_data)

# The header text was mis-encoded (mojibake); restored to the intended emoji.
print("\n🔍 First 10 rows of comparison:")
print(comparison.head(10))

# ----------------------------
# STEP 12: Visualize Improvement
# ----------------------------
# One figure per subject instead of three copy-pasted blocks.
# Red: observed scores of the test rows. Blue: model predictions with lunch
# forced to "standard".
# NOTE: this overlays observed values against predictions, so differences in
# spread reflect model error as well as the simulated lunch change.
for subject in ("math", "reading", "writing"):
    subject_label = subject.capitalize()

    plt.figure(figsize=(10, 6))
    sns.histplot(
        comparison[f"Original_{subject}"],
        color="red", label=f"Original {subject_label}", kde=True,
    )
    sns.histplot(
        comparison[f"Sim_{subject}_if_standard"],
        color="blue", label="If Standard Lunch", kde=True,
    )
    # Label the axis explicitly so it describes both overlaid series.
    plt.xlabel(f"{subject_label} score")
    plt.legend()
    plt.title(f"{subject_label} Scores Distribution: Original vs If Lunch=Standard")
    plt.show()

# ----------------------------
# STEP 13: Average Score Improvement
# ----------------------------
# Mean change in the predicted score when lunch is forced to "standard",
# taken over every row of the comparison table (simulated prediction minus
# baseline prediction).
# The header text was mis-encoded (mojibake); restored to the intended
# characters.
print("\n📈 Average Improvement if Lunch → Standard:")
for subject in ("math", "reading", "writing"):
    mean_change = (
        comparison[f"Sim_{subject}_if_standard"] - comparison[f"Pred_{subject}"]
    ).mean()
    # Signed formatting: a hard-coded "+" would read "+ -0.3" for a decrease.
    print(f"{subject.capitalize():<8}{mean_change:+.3f}")
