In [None]:
# Install the notebook's dependencies into the environment of the running kernel.
# The explicit %pip magic is used instead of a bare `pip ...` line, which only
# works while IPython's automagic is enabled and no variable named `pip` exists.
%pip install pandas numpy scikit-learn


In [None]:
import pandas as pd
# Direct-download link for the CSV file (served from drive.google.com).
url = 'https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX&export=download'

# Read the remote file into a DataFrame, then print its first five rows as a
# quick sanity check of the columns that were parsed.
data = pd.read_csv(filepath_or_buffer=url)
print(data.head(n=5))

In [None]:
# 'Price' is the regression target; every other column of `data` is a predictor.
y = data.loc[:, 'Price']
X = data.drop(columns=['Price'])


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then standardise X with the fitted statistics.
# NOTE(review): the scaler is fit on every row of X, including rows that later
# cells hold out for testing - confirm that this is intended.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# Five shuffled folds; the fixed seed makes the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results, stored in fold order so that position i of both lists
# refers to the same fold.
r2_scores = []
betas = []

for train_index, test_index in kf.split(X_scaled):
    # X_scaled is indexed directly with the fold's row positions;
    # y is a pandas object, so the same positions go through .iloc.
    X_train = X_scaled[train_index]
    X_test = X_scaled[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Fit on the training rows of this fold (fit() returns the estimator itself).
    model = LinearRegression().fit(X_train, y_train)

    # Score the fold on its held-out rows.
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Only the coefficient vector is kept; the fitted intercept is not stored.
    betas.append(model.coef_)
    r2_scores.append(r2)

# Select the fold whose held-out R2 is the largest and report its coefficients.
best_index = np.argmax(r2_scores)
best_r2 = r2_scores[best_index]
best_beta = betas[best_index]

print(f"Best R2 Score: {best_r2}")
print(f"Best Beta Matrix: {best_beta}")


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the other 70% are used for training.
(X_train_final, X_test_final,
 y_train_final, y_test_final) = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Fit the final regressor on the training portion (fit() returns the estimator).
final_model = LinearRegression().fit(X_train_final, y_train_final)

# Evaluate on the held-out portion and report R2.
final_predictions = final_model.predict(X_test_final)
final_r2_score = r2_score(y_true=y_test_final, y_pred=final_predictions)

print(f"Final R2 Score on Test Data: {final_r2_score}")


In [None]:
# Q2: linear regression fitted by gradient descent, compared across several learning rates
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load the dataset
url = 'https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX'
data = pd.read_csv(url)

# Separate features and target variable
X = data.drop(columns=['Price'])
y = data['Price']

# Scale the input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepend a constant column of ones so the gradient-descent model can learn an
# intercept. The standardised features have zero mean, so without this column
# the predictions X.dot(beta) made further down are centred on 0 and cannot
# reach the level of 'Price'. As a result beta[0] is the intercept and beta[1:]
# are the weights of the scaled features, in column order.
X_scaled = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])

# Split the dataset into training (56%), validation (14%), and test (30%) sets
# First split: 44% of all rows are set aside for validation + test.
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.44, random_state=42)
# Second split: 68% of that remainder (about 30% of all rows) becomes the test
# set; the rest (about 14% of all rows) is the validation set.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.68, random_state=42)

# Function to perform gradient descent
def gradient_descent(X, y, learning_rate, iterations):
    """Fit linear-regression weights with batch gradient descent.

    Starting from an all-zero weight vector, every iteration moves the weights
    against the least-squares gradient ``X.T @ (X @ beta - y) / m``, where
    ``m`` is the number of rows. No intercept is added here: include a column
    of ones in ``X`` if the model should learn one.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix; converted to a float array.
    y : array-like holding m values
        Targets. A 1-D sequence, an (m, 1) column or a pandas Series are all
        accepted; values are used positionally (any index is ignored).
    learning_rate : float
        Step size applied to the averaged gradient.
    iterations : int
        Number of update steps to perform.

    Returns
    -------
    numpy.ndarray of shape (n,)
        The weight vector after the last update.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or ``y`` does not hold exactly one
        value per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    # Flatten y so an (m, 1) column cannot broadcast `predictions - y`
    # into an (m, m) matrix.
    y = np.asarray(y, dtype=float).ravel()

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(f"y holds {y.shape[0]} values but X has {m} rows")

    beta = np.zeros(n)
    for _ in range(iterations):
        predictions = X.dot(beta)
        errors = predictions - y
        beta -= (learning_rate / m) * (X.T.dot(errors))
    return beta

# Every candidate step size is run for the same number of update steps.
iterations = 1000
learning_rates = [0.001, 0.01, 0.1, 1]

# Running record of the model with the highest validation R² seen so far.
best_beta, best_r2_val, best_r2_test = None, -np.inf, -np.inf

for lr in learning_rates:
    # Fit on the training split only.
    beta = gradient_descent(X_train, y_train, lr, iterations)

    # Score the fitted weights on the validation and test splits.
    val_predictions = X_val.dot(beta)
    test_predictions = X_test.dot(beta)
    r2_val = r2_score(y_val, val_predictions)
    r2_test = r2_score(y_test, test_predictions)

    # One report line per item, in a fixed order.
    print(
        f"Learning Rate: {lr}",
        f"Beta Coefficients: {beta}",
        f"Validation R² Score: {r2_val}",
        f"Test R² Score: {r2_test}",
        sep="\n",
    )

    # Selection uses the validation score only; the test score of the selected
    # model is carried along for reporting.
    if not r2_val > best_r2_val:
        continue
    best_beta = beta
    best_r2_val, best_r2_test = r2_val, r2_test

# Summary of the selected model.
print(
    "\nBest Model:",
    f"Best Beta Coefficients: {best_beta}",
    f"Best Validation R² Score: {best_r2_val}",
    f"Best Test R² Score: {best_r2_test}",
    sep="\n",
)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

# Step 1: Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
column_names = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
                "num_doors", "body_style", "drive_wheels", "engine_location",
                "wheel_base", "length", "width", "height", "curb_weight",
                "engine_type", "num_cylinders", "engine_size", "fuel_system",
                "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
                "city_mpg", "highway_mpg", "price"]

# Missing entries are written as '?'. Declaring that marker at read time turns
# them into NaN and lets pandas parse the affected columns as numbers, so no
# separate to_numeric pass (and no pre-defined column list) is needed.
data = pd.read_csv(url, names=column_names, na_values='?')

# Step 2: Handle missing values
# Drop rows without a price FIRST: the target must not be imputed, and filling
# it before this step would leave nothing for the drop to remove.
data = data.dropna(subset=['price']).reset_index(drop=True)

# Fill the remaining NaN values in numeric columns with the column mean.
# ('price' is part of numeric_cols but has no NaN left at this point.)
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Step 3: Convert non-numeric values
# (i) Convert num_doors and num_cylinders from number words to numbers.
# The converted values are assigned back to the column: calling
# replace(..., inplace=True) on a selected column acts on an intermediate
# object and is not guaranteed to update `data`.
door_words = {'two': 2, 'four': 4}
# 'twelve' is listed among the num-of-cylinders values in the UCI attribute
# description; without it that entry would be coerced to NaN.
cylinder_words = {'two': 2, 'three': 3, 'four': 4, 'five': 5,
                  'six': 6, 'eight': 8, 'twelve': 12}

for col, words in (('num_doors', door_words), ('num_cylinders', cylinder_words)):
    # Map known words to numbers and leave anything else untouched, then coerce:
    # values that still are not numeric become NaN.
    converted = pd.to_numeric(
        data[col].map(lambda value, words=words: words.get(value, value)),
        errors='coerce',
    )
    # Fill unresolved entries with the column's most frequent value so that no
    # NaN reaches the feature matrix built from `data`.
    if converted.isna().any() and converted.notna().any():
        converted = converted.fillna(converted.mode().iloc[0])
    data[col] = converted

# (ii) Dummy encoding for body_style and drive_wheels
data = pd.get_dummies(data, columns=['body_style', 'drive_wheels'], drop_first=True)

# (iii) Label encoding for make, aspiration, engine_location, and fuel_type
label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
label_encoder = LabelEncoder()
for col in label_cols:
    # The encoder is re-fit for every column, so the integer codes are
    # independent from one column to the next.
    data[col] = label_encoder.fit_transform(data[col])

# (iv) Replace fuel_system values: 1 if the value contains 'pfi', else 0
data['fuel_system'] = data['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x) else 0)

# (v) Replace engine_type values: 1 if the value contains 'ohc', else 0
data['engine_type'] = data['engine_type'].apply(lambda x: 1 if 'ohc' in str(x) else 0)

# Step 4: Prepare input features and scale
# 'price' is the target; all remaining columns, cast to float, are the features.
y = data['price'].astype(float)
X = data.drop(columns=['price']).astype(float)

# Standardise the features.
# NOTE(review): the scaler is fit on every row before the train/test split
# below - confirm that this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Train and test linear regression model
# 70% of the rows train the model, 30% are held out; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit on the training rows (fit() returns the estimator itself).
lin_reg = LinearRegression().fit(X_train, y_train)

# Baseline R² on the held-out rows.
y_pred = lin_reg.predict(X_test)
initial_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Initial R² Score: {initial_r2}")

# Step 6: Apply PCA
# A fractional n_components asks PCA to keep 95% of the variance.
pca = PCA(n_components=0.95)

# The projection is learned from the training rows only and then applied,
# unchanged, to the test rows.
X_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Fit a second regressor on the reduced training data (fit() returns the estimator).
lin_reg_pca = LinearRegression().fit(X_pca, y_train)

# Evaluate on the reduced test data.
y_pred_pca = lin_reg_pca.predict(X_test_pca)
pca_r2 = r2_score(y_true=y_test, y_pred=y_pred_pca)
print(f"R² Score after PCA: {pca_r2}")
