In [None]:
from itertools import combinations

import pandas as pd

def build_interaction_terms(frame: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise products of the numeric columns of ``frame``.

    Every unordered pair of numeric columns ``(a, b)``, taken in column
    order, yields one output column named ``"{a}_{b}"`` holding ``a * b``.
    Non-numeric columns (text, dates, ...) are skipped because they cannot
    be multiplied element-wise.

    Args:
        frame: Source data; it is not modified.

    Returns:
        A new DataFrame sharing ``frame``'s index. It has no columns when
        ``frame`` contains fewer than two numeric columns.
    """
    numeric = frame.select_dtypes(include="number")
    # Collect every product first and build the frame once, instead of
    # inserting columns into an existing DataFrame one at a time.
    products = {
        f"{left}_{right}": numeric[left] * numeric[right]
        for left, right in combinations(numeric.columns, 2)
    }
    return pd.DataFrame(products, index=frame.index)


def build_squared_terms(frame: pd.DataFrame) -> pd.DataFrame:
    """Return the square of every numeric column of ``frame``.

    Each numeric column ``c`` yields one output column named
    ``"{c}_squared"``. Non-numeric columns are skipped.

    Args:
        frame: Source data; it is not modified.

    Returns:
        A new DataFrame sharing ``frame``'s index, with one column per
        numeric column of ``frame``.
    """
    numeric = frame.select_dtypes(include="number")
    squares = {f"{column}_squared": numeric[column] ** 2 for column in numeric.columns}
    return pd.DataFrame(squares, index=frame.index)


# The guard keeps the helpers above importable without touching the file
# system. The statements below deliberately stay at module level (not inside
# a function) so that df, interaction_terms and polynomial_features remain
# module globals after the cell/script has run.
if __name__ == "__main__":
    # Load the Excel file
    file_path = r'C:\Users\Dell\Desktop\ML\flask copy\main.output.xlsx'
    df = pd.read_excel(file_path)

    # Display basic information about the dataset.
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    print("Dataset Information:")
    df.info()

    # Display descriptive statistics of numerical features
    print("\nDescriptive Statistics of Numerical Features:")
    print(df.describe())

    # Check for missing values (count of nulls per column)
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Check for outliers
    # TODO: not implemented yet. Options: visualize the distribution of the
    # numerical features, or use statistical methods such as Z-score or IQR.

    # Explore the existing features and their relationship with the targets
    # TODO: not implemented yet. Options: scatter plots, histograms, or
    # correlation matrices.

    # Feature engineering: pairwise interaction terms and squared terms,
    # computed from the numeric columns only.
    interaction_terms = build_interaction_terms(df)
    polynomial_features = build_squared_terms(df)

    # Concatenate the original dataframe with the new features
    df = pd.concat([df, interaction_terms, polynomial_features], axis=1)

    # df now holds the original columns followed by the engineered features
    # and can be used for model training.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from math import sqrt
import joblib

# Read the Excel workbook into a DataFrame
file_path = r'C:\Users\Dell\Desktop\ML\flask copy\main.output.xlsx'
df = pd.read_excel(file_path)

# Columns fed to the model as features
input_columns = [
    "Technology Acceptance",
    "Level of use of AI based tools",
    "Technology based Tutoring System",
    "Organisational Performance",
    "Student's Performance",
]

# Columns the model is trained to predict (all five are predicted jointly)
output_columns = [
    "Technology_Acceptance_Range",
    "Level_of_use_of_AI_based_tools_Range",
    "Technology_based_Tutoring_System_Range",
    "Organisational_Performance_Range",
    "Student's_Performance_Range",
]

# Pull features and targets out of the frame as arrays
y = df[output_columns].values
X = df[input_columns].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter search for a random forest: every combination of tree count
# and maximum depth is evaluated with 5-fold cross-validation on the training set
rf_regressor = RandomForestRegressor(random_state=42)
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 15],
)
rf_grid_search = GridSearchCV(
    estimator=rf_regressor,
    param_grid=rf_param_grid,
    cv=5,
).fit(X_train, y_train)

# Keep the estimator with the winning parameter combination
rf_best_model = rf_grid_search.best_estimator_

# Score the winning model on the held-out rows
y_pred_rf = rf_best_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Report the metrics, one per line
print("Random Forest Regressor Metrics:")
for metric_label, metric_value in (
    ("R-squared", r2_rf),
    ("Mean Absolute Error", mae_rf),
    ("Mean Squared Error", mse_rf),
    ("Root Mean Squared Error", rmse_rf),
):
    print(f"{metric_label}: {metric_value}")

# Persist the winning model to disk with joblib
rf_model_filename = 'random_forest_model.joblib'
joblib.dump(rf_best_model, rf_model_filename)

print(f"Random Forest Regressor model saved as {rf_model_filename}")
