In [None]:
# Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd

# Load the three source tables from disk.
table1, table2, table3 = map(pd.read_csv, ('Table1.csv', 'Table2.csv', 'Table3.csv'))

# Inner-join table1 with table2 on the shared 'Style code/Product code'
# column, then inner-join that result with table3 on 'Product'.
merged_df = (
    table1
    .merge(table2, on='Style code/Product code', how='inner')
    .merge(table3, on='Product', how='inner')
)

# Preview the first rows of the combined frame.
print(merged_df.head())

In [None]:
# Visualization libraries; later plotting cells also rely on `plt` and `sns`
# being imported here.
import matplotlib.pyplot as plt
import seaborn as sns

# Basic information about the merged DataFrame.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
merged_df.info()

# Summary statistics
print(merged_df.describe())

# Check for missing values (count of nulls per column)
print(merged_df.isnull().sum())

# Unique values in specific columns
# (assumes the merged tables provide 'Category' and 'Size' columns)
print(merged_df['Category'].unique())
print(merged_df['Size'].unique())

# Example: Histogram of Price with a kernel density estimate overlaid
plt.figure(figsize=(8, 6))
sns.histplot(merged_df['Price'], bins=20, kde=True)
plt.title('Distribution of Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Example: Boxplot of Star Ratings by Category
plt.figure(figsize=(8, 6))
sns.boxplot(x='Category', y='Star Rating', data=merged_df)
plt.title('Star Ratings by Category')
plt.xlabel('Category')
plt.ylabel('Star Rating')
plt.show()

In [None]:
# Step 1: Initial Data Examination
## Basic Information & Structure
- Check the DataFrame structure, column names, and data types.
- Verify that the columns contain the expected data.

In [None]:
# Display basic information about the merged DataFrame.
# info() prints directly and returns None, so it is called without print()
# to avoid a trailing "None" in the cell output.
merged_df.info()

# Display the first few rows to get an initial understanding
print(merged_df.head())

In [None]:
# Step 2: Summary Statistics
## Descriptive Statistics
Compute basic statistics to understand the central tendency, dispersion, and shape of the data.

In [None]:
# Descriptive statistics for the merged DataFrame
summary_stats = merged_df.describe()
print(summary_stats)

In [None]:
# Step 3: Handling Missing Values
## Missing Values Investigation
Check for missing values and decide how to handle them (impute or drop).

In [None]:
# Count the missing entries in every column
missing_per_column = merged_df.isna().sum()
print(missing_per_column)

In [None]:
# Step 4: Exploring Categorical Variables
## Categorical Variables Analysis
- Explore the unique values and their frequencies in categorical columns.
- Visualize the categorical distributions.

In [None]:
# Frequency of each value in the 'Category' column
category_counts = merged_df['Category'].value_counts()
print(category_counts)

# Bar chart of the number of rows per category
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=merged_df, x='Category', ax=ax)
ax.set_title('Distribution of Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Step 5: Analyzing Numerical Variables
## Numerical Variables Analysis
- Examine the distributions of, and relationships between, numerical variables.
- Visualize distributions, correlations, or trends.

In [None]:
# Price distribution: 20-bin histogram with a KDE curve on top
fig_hist, ax_hist = plt.subplots(figsize=(8, 6))
sns.histplot(merged_df['Price'], bins=20, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Prices')
ax_hist.set_xlabel('Price')
ax_hist.set_ylabel('Frequency')
plt.show()

# Relationship between price and star rating, one point per row
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=merged_df, x='Price', y='Star Rating', ax=ax_scatter)
ax_scatter.set_title('Price vs. Star Rating')
ax_scatter.set_xlabel('Price')
ax_scatter.set_ylabel('Star Rating')
plt.show()

In [None]:
# Step 6: Relationships and Correlations
## Correlation Analysis
- Explore the relationships between numerical variables.
- Calculate correlations and visualize them.

In [None]:
# Correlation heatmap.
# Correlations are only defined for numeric data. The frame is restricted to
# numeric/boolean columns explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (numeric_only defaults to False there),
# while older versions silently dropped them.
numeric_df = merged_df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Step 7: Advanced Analysis & Insights
## Further Analysis for Specific Insights
- Analyze specific relationships, patterns, or trends of interest.
- Derive insights based on domain-specific knowledge.

In [None]:
# Box plot comparing the spread of star ratings across categories
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=merged_df, x='Category', y='Star Rating', ax=ax)
ax.set(title='Star Ratings by Category', xlabel='Category', ylabel='Star Rating')
plt.show()

# Additional analysis based on domain-specific requirements
# ...

# Repeat similar analysis or visualizations for other variables of interest
# ...

In [None]:
# Step 8: Conclusions & Observations
## Observations and Conclusions
- Summarize the key findings, trends, and anomalies discovered during EDA.
- Derive insights and potential action points from the data.

In [None]:
# Step 9: Documentation
## Documentation & Reporting
Document the EDA process, findings, visualizations, and conclusions in a report or notebook for future reference or sharing.

This comprehensive EDA aims to unveil patterns, relationships, and anomalies within the data to derive actionable insights. The specific visualizations or analyses may vary with the data and domain knowledge, but these steps provide a structured approach to understanding the dataset thoroughly.

In [None]:
# Phase 4: Model Training and Evaluation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge  # Ridge Regression for regularization
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the merged data.
# NOTE(review): no cell in this notebook writes 'merged_data.csv'; it is
# presumably exported elsewhere -- confirm the file exists before running.
merged_df = pd.read_csv('merged_data.csv')

# Both models use the same feature matrix: every column except the two
# targets. StandardScaler, Ridge and RandomForestRegressor all need numeric
# input, so non-numeric columns are left out instead of crashing the scaler.
# TODO: encode categorical columns if they should contribute to the models.
target_columns = ['Price', 'Star Rating']
feature_frame = (
    merged_df
    .drop(columns=target_columns)
    .select_dtypes(include=['number', 'bool'])
)
if feature_frame.shape[1] == 0:
    raise ValueError("merged_data.csv has no numeric feature columns to train on")

# Separate features and labels for price and star rating
X_price = feature_frame
y_price = merged_df['Price']
X_star = feature_frame
y_star = merged_df['Star Rating']

# Split data into training and test sets for price and star rating.
# The same random_state on same-length inputs gives both targets the same
# train/test row assignment.
X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(
    X_price, y_price, test_size=0.2, random_state=42
)
X_train_star, X_test_star, y_train_star, y_test_star = train_test_split(
    X_star, y_star, test_size=0.2, random_state=42
)

# Feature scaling: fit on the training split only, then apply to the test split.
# NOTE(review): the scaler is fitted on the whole training split before
# GridSearchCV makes its folds, so each validation fold has influenced the
# scaling statistics; a Pipeline inside GridSearchCV would avoid that.
scaler_price = StandardScaler()
X_train_price_scaled = scaler_price.fit_transform(X_train_price)
X_test_price_scaled = scaler_price.transform(X_test_price)

scaler_star = StandardScaler()
X_train_star_scaled = scaler_star.fit_transform(X_train_star)
X_test_star_scaled = scaler_star.transform(X_test_star)

# Price Prediction - Ridge Regression with hyperparameter tuning
param_grid_price = {'alpha': [0.1, 1, 10]}
ridge_price = GridSearchCV(Ridge(), param_grid_price, cv=5)
ridge_price.fit(X_train_price_scaled, y_train_price)
best_alpha_price = ridge_price.best_params_['alpha']
# GridSearchCV (refit=True by default) has already refitted the winning
# configuration on the full training split, so that estimator is reused
# instead of training an identical model a second time.
price_model = ridge_price.best_estimator_

# Star Rating Prediction - Random Forest Regression with hyperparameter tuning
param_grid_star = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15]}
rf_star = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_star, cv=5)
rf_star.fit(X_train_star_scaled, y_train_star)
best_n_estimators_star = rf_star.best_params_['n_estimators']
best_max_depth_star = rf_star.best_params_['max_depth']
# Same reasoning as above: the refitted best forest is already available.
star_model = rf_star.best_estimator_

# Predictions on the held-out test split
price_pred = price_model.predict(X_test_price_scaled)
star_pred = star_model.predict(X_test_star_scaled)

# Evaluation.
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
price_rmse = mean_squared_error(y_test_price, price_pred) ** 0.5
price_r2 = r2_score(y_test_price, price_pred)
star_rmse = mean_squared_error(y_test_star, star_pred) ** 0.5
star_r2 = r2_score(y_test_star, star_pred)

print(f"Price Prediction - RMSE: {price_rmse}, R² Score: {price_r2}")
print(f"Star Rating Prediction - RMSE: {star_rmse}, R² Score: {star_r2}")

In [None]:
# Saving the Best Model

In [None]:
# Persist the trained models to disk.
# Relies on price_model, star_model, price_r2 and star_r2 from the training
# and evaluation cell above.
from joblib import dump

# There is one candidate model per target. It is kept only when its test R²
# is positive, i.e. it predicts better than always returning the mean target.
best_model_price = price_model if price_r2 > 0 else None
best_model_star = star_model if star_r2 > 0 else None

# Compare against None explicitly: the truthiness of an estimator is not a
# reliable "model exists" signal (ensemble estimators define __len__, so
# their truth value depends on how many sub-estimators they hold).
if best_model_price is not None:
    dump(best_model_price, 'best_model_price.joblib')
    print("Best Price Prediction Model Saved")
else:
    print("No valid Price Prediction Model to save")

if best_model_star is not None:
    dump(best_model_star, 'best_model_star.joblib')
    print("Best Star Rating Prediction Model Saved")
else:
    print("No valid Star Rating Prediction Model to save")