In [None]:
# # !pip install pandas openpyx
# # !pip install pandas
# !pip install matplotlib
# !pip install numpy
# !pip install seaborn
# !pip install plotly
# !pip install scikit-learn

: 

In [None]:
# !pip install xgboost
# !pip install imbalanced-learn

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as px
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OrdinalEncoder
from datetime import datetime, timedelta
import openpyxl
import xgboost as xgb


In [None]:
df = pd.read_excel("C:/Users/viren/OneDrive/Desktop/IIT-MADARAS(GUVI)/Industrial Copper Modeling/Copper_Set.xlsx")

In [None]:
df

In [None]:
# verify the number of unique value in each features
for i in list(df.columns):
    print(f"{i}: {df[i].nunique()}")

In [None]:
df.dtypes

In [None]:
# Convert quantity tons to numeric; entries that cannot be parsed become NaN.
df['quantity tons'] = pd.to_numeric(df['quantity tons'], errors='coerce')

# Render the date columns as text and strip a trailing ".0" (left behind when
# the values are stored as floats) so they match the %Y%m%d format exactly.
item_date_text = df['item_date'].astype(str).str.replace(r'\.0$', '', regex=True)
delivery_date_text = df['delivery date'].astype(str).str.replace(r'\.0$', '', regex=True)

# Parse the cleaned text, not the raw column. Missing or invalid dates become NaT.
df['item_date_1'] = pd.to_datetime(item_date_text, format='%Y%m%d', errors='coerce')
df['delivery date_1'] = pd.to_datetime(delivery_date_text, format='%Y%m%d', errors='coerce')

In [None]:
df.head(3)

In [None]:
df.isnull().sum()

In [None]:
# Verify NaN values in 'Material_reference' after replacement in % 
print(np.round(df['material_ref'].isnull().mean()*100, 5),"% of missing values")

In [None]:
df.drop(columns=['id', 'material_ref'], inplace=True)
df

In [None]:
df.describe().T

In [None]:
# Quantity and selling price can never be zero or negative, so any such value
# is treated as missing and replaced with NaN.

for positive_only in ('quantity tons', 'selling_price'):
    df[positive_only] = df[positive_only].mask(df[positive_only] <= 0)
df.describe().T

In [None]:
# Checking the null values in this dataFrame:
df.isnull().sum()

In [None]:
df.dtypes

In [None]:
# Handling null values using median and mode
# identifying the object and numerical columns
object_columns = ['item_date','item_date_1','delivery date','delivery date_1','status']
numerical_columns = ['quantity tons','customer','country','application','thickness','selling_price']

In [None]:
# median - middle value of the sorted data; mode - the most frequent value.
# Fill missing values in object columns with the mode.
# The result is assigned back to df[col]: calling fillna(inplace=True) on the
# df[col] selection acts on a temporary object and is not guaranteed to update df.
for col in object_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Fill missing values in numerical columns with the median.
for col in numerical_columns:
    df[col] = df[col].fillna(df[col].median())
df.isna().sum()

In [None]:
# Chicking for the unique values of all the columns:

for i in df.columns:
    print(i,":",df[i].nunique())

In [None]:
for col in ['country','status','item type','application']:
    print(col,df[col].unique())
    print('--'*20)

In [None]:
df['status'].unique()

In [None]:
for col in ['status','item type']:
    print(df[col].value_counts())
    print('--'*20)
    

In [None]:
df['status'] = df['status'].str.strip()

In [None]:
# Convert the categorical 'status' labels to integer codes with an explicit mapping.
# Series.map turns any label that is not a key of the dictionary into NaN.
df['status'] = df['status'].map({'Lost': 0,'Won': 1,'Draft': 2,'To be approved': 3,
                                  'Not lost for AM': 4,'Wonderful': 5,'Revised': 6,'Offered': 7,'Offerable': 8})


df.head(3)

In [None]:
df['status'].unique()

In [None]:
df['item type'] = OrdinalEncoder().fit_transform(df[['item type']])

df['item type'].unique()

In [None]:
df['status'].unique()

# Skewness Handling - Feature Scaling(Log Transformation)

In [None]:
# Find outliers - box plot & skewes data - hist plot and violin plot

def plot(df, column):
    """Show a box plot, a histogram with KDE, and a violin plot of one column.

    The three panels share a single 20x5 figure and are drawn left to right.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to visualise.
    """
    # (seaborn plotting function, extra keyword arguments, panel title)
    panels = (
        (sns.boxplot, {}, f'Box Plot for {column}'),
        (sns.histplot, {'kde': True, 'bins': 50}, f'Distribution Plot for {column}'),
        (sns.violinplot, {}, f'Violin plot for {column}'),
    )

    plt.figure(figsize=(20,5))
    for position, (draw, extra, heading) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        draw(data=df, x=column, **extra)
        plt.title(heading)
    plt.show()

In [None]:
for i in ['quantity tons','customer','country','item type','application','thickness','width', 'selling_price']:
    plot(df, i)

In [None]:
# quantity tons, thickness and selling price data are skewed, so we will apply log transformation to these columns.
df1 = df.copy()
df1['quantity tons_log'] = np.log(df1['quantity tons'])
df1['thickness_log'] = np.log(df1['thickness'])
df1['selling_price_log'] = np.log(df1['selling_price'])
df1

In [None]:
# after log transformation the data are normally distributed and reduced the skewness. [hist plot and violin plot]
for i in ['quantity tons_log','thickness_log','width', 'selling_price_log']:
    plot(df1, i)

# Outlier Handling - Interquartile Range (IQR) method

In [None]:
df2 = df1.copy()
df2

In [None]:
# Using IQR and clip() methods to handle outliers and add a new column of DataFrame

def outlier(df, column):
    """Clip ``df[column]`` in place to the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to it. The DataFrame is modified in place
    and nothing is returned.

    Args:
        df: DataFrame holding the column to clip.
        column: Name of a numeric column in ``df``.
    """
    first_quartile, third_quartile = df[column].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    df[column] = df[column].clip(first_quartile - 1.5 * spread,
                                 third_quartile + 1.5 * spread)
    

In [None]:
# (Ex: lower threshold = 5 and upper threshold = 20)
# aboveu  pper threshold values (>20) are converted tou peer threshold value (20) in features
# below lower threshold values (<5) are converted to lower threshold value (5) in features

outlier(df2,'quantity tons_log')
outlier(df2,'thickness_log')
outlier(df2,'selling_price_log')
outlier(df2,'width')
df2


In [None]:
# Transform the outliers to within range using IQR and clip() methods - box plot

for i in ['quantity tons_log','thickness_log','selling_price_log','width']:
    plot(df2, i)

In [None]:
df2.describe().T

In [None]:
# After add the new columns of 'quantity tons_log', thickness_log','selling_price_log', and 'width', drop the existing columns of 'quantity tons', 'thickness', and 'selling_price' columns.
df3 = df2.drop(columns=['quantity tons', 'thickness', 'selling_price'])
df3

In [None]:
# Need to veryfy any columns are highly correlated using Heatmap. if any columns correlation value >= 0.7 (absolute value), drop the columns

col = ['quantity tons_log','customer','country','status','application','width','product_ref','thickness_log','selling_price_log']
df_heatmap = df3[col].corr()
sns.heatmap(df_heatmap, annot=True)

In [None]:
# The largest absolute correlation is only about 0.4 (0.4 or -0.42), so no columns are highly correlated and none need to be dropped.

In [None]:
df4 = df3.copy()
df4

In [None]:
# The 'delivery date' is previous date of 'item date'. so this is impossible. delivery date is always greater.
# so finding the difference between 'delivery date' and 'item date' and adding a new column of 'delivery_date_diff' in df4 DataFrame.
df4['delivery_date_diff'] = (df4['delivery date_1'] - df4['item_date_1']).dt.days
df4.head()

In [None]:
# Convert the dat type using pandas
df4['item_date_1'] = pd.to_datetime(df4['item_date_1'])

# split the day, month, and year from 'item_date_1' column and add dataframe (this dataframe is used for modeling)
df4['item_date_day'] = df4['item_date_1'].dt.day
df4['item_date_month'] = df4['item_date_1'].dt.month
df4['item_date_year'] = df4['item_date_1'].dt.year  
df4

In [None]:
# split the non-negative value of 'Data_difference' column in separate dataFrame
df_f1 = df4[df4['delivery_date_diff'] >= 0]

# after split, the index values are unordered. so reset the index  to ascending order from 0
df_f1 = df_f1.reset_index(drop=True)
df_f1

In [None]:
# split the negative value of 'delivery_date_diff' column in another dataframe
df_f2 = df4[df4['delivery_date_diff'] < 0]
df_f2 = df_f2.reset_index(drop=True)
df_f2

In [None]:
# For these 16108 rows the 'delivery date' is earlier than the 'item date'.
# First we train the ML model on the rows with a consistent delivery date (df_f1), then use it to predict 'delivery_date_diff' for the rows in the 'df_f2' DataFrame.

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor


In [None]:
df_f1.columns

In [None]:
# find best algorithm to prediction based on R2, meanabsolute error,mean squared error values

def machine_learning_delivery_date(df, algorithm, random_state=42):
    """Fit one regressor on ``delivery_date_diff`` and score it on a 20% hold-out.

    Args:
        df: DataFrame containing the target column ``delivery_date_diff`` plus
            the feature columns. ``item_date_1`` and ``delivery date_1`` are
            datetime columns and are excluded from the features.
        algorithm: Estimator class (not an instance); it is instantiated with
            its default arguments.
        random_state: Seed for the train/test split, so every algorithm is
            scored on the same hold-out rows and the comparison is repeatable.

    Returns:
        dict with the algorithm name, R2, mean absolute error, mean squared
        error and root mean squared error on the hold-out set.
    """
    x = df.drop(columns=['item_date_1', 'delivery date_1', 'delivery_date_diff'])
    # The raw 'delivery date' together with 'item_date' determines the target,
    # so leaving it in the features leaks the answer and inflates the scores.
    x = x.drop(columns=['delivery date'], errors='ignore')
    y = df['delivery_date_diff']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state)

    model = algorithm().fit(x_train, y_train)
    y_pred = model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    metrics = {'Algorithm': algorithm.__name__,
               'R2':r2,
               'Mean Absolute Error': mae,
               'Mean Squared Error': mse,
               'Root Mean Squared Error': rmse}
    return metrics

In [None]:
print(machine_learning_delivery_date(df_f1, DecisionTreeRegressor))
print(machine_learning_delivery_date(df_f1, ExtraTreesRegressor))
print(machine_learning_delivery_date(df_f1, RandomForestRegressor))
print(machine_learning_delivery_date(df_f1, AdaBoostRegressor))
print(machine_learning_delivery_date(df_f1, GradientBoostingRegressor))
print(machine_learning_delivery_date(df_f1, XGBRegressor))

In [None]:
# The random forest algorithm has low bias and reduces overfitting compared to the others.

In [None]:
# Train the model using RandomForestRegressor algorithm and predict the 'delivery_date_diff'.
# 'item_date_1','delivery date_1' :- these columns are non-numerical and cannot passed, so skip the columns in the modeltraining and prediction

def ml_date_difference():
    """Predict ``delivery_date_diff`` for the rows of ``df_f2``.

    A RandomForestRegressor is trained on 80% of ``df_f1`` (the rows whose
    delivery date is not earlier than the item date) and then applied to every
    row of ``df_f2``.

    Returns:
        list of predicted day differences, one per row of ``df_f2`` in row order.
    """
    # Datetime columns cannot be passed to the model and the target must not be a feature.
    x = df_f1.drop(columns=['item_date_1','delivery date_1','delivery_date_diff'])
    # The raw 'delivery date' encodes the target for df_f1 and is exactly the
    # value considered wrong for df_f2, so it must not be used as a feature.
    x = x.drop(columns=['delivery date'], errors='ignore')
    y = df_f1['delivery_date_diff']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    model = RandomForestRegressor().fit(x_train, y_train)

    if df_f2.empty:
        return []

    # One vectorised predict call, with the columns in the training order,
    # instead of one model.predict call per row.
    y_pred_list = list(model.predict(df_f2[x.columns]))

    return y_pred_list

In [None]:
# Machine learning model predict the data difference of (df_f2) dataframe
date_difference = ml_date_difference()

In [None]:
print(date_difference)

In [None]:
# Convert float values into integer using list comprehension method.

date_defference1 = [int(round(i,0)) for i in date_difference]
print(date_defference1)

In [None]:
# Add 'delivery_date_diff' column in the datframe
df_f2['delivery_date_diff'] = pd.DataFrame(date_defference1)
df_f2

In [None]:
# calculate delivery date (item_date + delivery_date_diff = delivery date)

def find_delivery_date(item_date, date_difference):
    """Return ``item_date`` shifted by ``date_difference`` days as 'YYYY-MM-DD'.

    Args:
        item_date: A datetime-like object (``datetime`` or ``pandas.Timestamp``).
        date_difference: Number of days to add; Python or NumPy numbers are accepted.

    Returns:
        The resulting date formatted with ``'%Y-%m-%d'``.
    """
    # timedelta rejects NumPy integer scalars, so normalise to a plain float first.
    result_date = item_date + timedelta(days=float(date_difference))

    delivery_date = result_date.strftime('%Y-%m-%d')

    return delivery_date

In [None]:
# find out the delivery date and add to dataframe
df_f2['item_date_1'] = pd.to_datetime(df_f2['item_date_1'])

# corrected delivery date = item date + predicted day difference (vectorised)
corrected_delivery_date = df_f2['item_date_1'] + pd.to_timedelta(df_f2['delivery_date_diff'], unit='D')

# Overwrite 'delivery date_1' as well: the delivery day/month/year features are
# later extracted from that column, so leaving it untouched would discard the correction.
df_f2['delivery date_1'] = corrected_delivery_date
df_f2['delivery_date'] = corrected_delivery_date.dt.strftime('%Y-%m-%d')
df_f2

In [None]:
# Finally concatenate the both dataframe into single dataframe
df_final = pd.concat([df_f1, df_f2], ignore_index=True)
df_final


In [None]:
print(df_final.columns.tolist())

In [None]:
# Split the day, month, and year from 'delivery date_1' column and add into dataframe (This data also help us to prediction)
df_final['delivery date_1'] = pd.to_datetime(df_final['delivery date_1'])
df_final['delivery_date_day'] = df_final['delivery date_1'].dt.day
df_final['delivery_date_month'] = df_final['delivery date_1'].dt.month
df_final['delivery_date_year'] = df_final['delivery date_1'].dt.year  
df_final.drop(columns=['item_date','delivery date','delivery_date','item_date_1','delivery date_1','delivery_date_diff'], inplace=True)
df_final

# Classification Method - Predict Status 

In [None]:
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import pickle


In [None]:
# df_final = pd.read_csv

In [None]:
df_final.head()

In [None]:
# check data types
df_final.dtypes

In [None]:
df_c = df_final.copy()

# filter the status column values only 1 & 0 rows in a new dataFrame ['Won':1 & 'Lost':0]
df_c = df_c[(df_c.status == 1) | (df_c.status == 0)]
df_c

In [None]:
# check no of rows (records) each 1 and 0 in dataframe
df_c['status'].value_counts()

In [None]:
# The 'Won' and 'Lost' counts in the status feature are very unbalanced, so the
# data is resampled (SMOTE oversampling + Tomek-link cleaning) to reduce the gap.

x = df_c.drop('status', axis=1)
y = df_c['status']

# Seed the resampler so the synthetic rows, and every score computed from them, are reproducible.
# NOTE(review): resampling happens before the train/test split used below, so
# synthetic points generated from hold-out rows can end up in the training set;
# consider resampling the training portion only.
x_new, y_new = SMOTETomek(random_state=42).fit_resample(x, y)

In [None]:
x.shape, y.shape, x_new.shape, y_new.shape

In [None]:
# Check accuracy of training and testing using metrics
# algorithm.__name__ - it return the algorith name
def machine_learning_classification(x_new, y_new, algorithm):
    """Fit one classifier and report its train and test accuracy.

    The data is split 80/20 with a fixed seed (42). ``algorithm`` is an
    estimator class and is instantiated with its default arguments.

    Returns:
        dict with keys 'algorithm', 'accuracy_train' and 'accuracy_test'.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        x_new, y_new, test_size=0.2, random_state=42)
    fitted = algorithm().fit(features_train, target_train)

    train_score = metrics.accuracy_score(target_train, fitted.predict(features_train))
    test_score = metrics.accuracy_score(target_test, fitted.predict(features_test))

    return {'algorithm': algorithm.__name__,
            'accuracy_train': train_score,
            'accuracy_test': test_score}


In [None]:
print(machine_learning_classification(x_new, y_new, DecisionTreeClassifier))
print(machine_learning_classification(x_new, y_new, ExtraTreesClassifier))
print(machine_learning_classification(x_new, y_new, RandomForestClassifier))
print(machine_learning_classification(x_new, y_new, AdaBoostClassifier))
print(machine_learning_classification(x_new, y_new, GradientBoostingClassifier))
print(machine_learning_classification(x_new, y_new, XGBClassifier))

In [None]:
# We got good accuracy after oversampling with the SMOTETomek method.
# ExtraTreesClassifier and RandomForestClassifier both have good test accuracy, but their training accuracy indicates overfitting.
# Best performers (generalization): ExtraTreesClassifier (~98.1%), RandomForestClassifier (~97.8%)

# Overfitting suspect: DecisionTreeClassifier (train = 100%)

# Weaker learners: AdaBoostClassifier and GradientBoostingClassifier

# XGBoost: Strong, but not beating ExtraTrees/RandomForest in your dataset

In [None]:
# GridSearchCV is a cross-validated hyperparameter search.
# Hyperparameter tuning - we set the algorithm's parameter values explicitly to reduce overfitting and get better accuracy.

# With GridSearchCV we pass several candidate values for each parameter; it evaluates every combination of values and
# finally returns the parameter values with the best cross-validation score.

# example: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
# Note: this process can take a long time (avg: 1 hr 15 mins). Please be patient.

In [None]:
# refer parameter values: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

param_grid = {'max_depth'        : [2, 5, 10, 20],
              'min_samples_split':[2, 5, 10],
              'min_samples_leaf' :[1, 2, 4],
              'max_features'     : ['sqrt', 'log2']}

grid_search = GridSearchCV(estimator=RandomForestClassifier(),param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

In [None]:
# n_jobs=-1 means it uses all available processors in the process

In [None]:
#evalute all the parameter combinations and return the best parameter based on score
grid_search.best_params_      

In [None]:
grid_search.best_score_

In [None]:
# Passing the parameters in the random forest algorithm and check the accuracy for training and testing

x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

model = RandomForestClassifier(max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=2).fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

accuracy_train = metrics.accuracy_score(y_train, y_pred_train)
accuracy_test = metrics.accuracy_score(y_test, y_pred_test)
accuracy_train, accuracy_test

In [None]:
# Passing the parameters in the random forest algorithm and check the accuracy for training and testing

x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

model = RandomForestClassifier(max_depth=20, max_features='log2', min_samples_leaf=1, min_samples_split=2).fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

accuracy_train = metrics.accuracy_score(y_train, y_pred_train)
accuracy_test = metrics.accuracy_score(y_test, y_pred_test)
accuracy_train, accuracy_test

In [None]:
# Now the training accuracy overfitting reduced.so now model will predict effectively for unseen data

In [None]:
# predict the status and check the accuracy using metrics
x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

model = RandomForestClassifier(max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=2).fit(x_train, y_train)
# Predict on the hold-out rows: the predictions must line up with y_test,
# otherwise the metrics compare arrays of different lengths.
y_pred = model.predict(x_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test,y_pred=y_pred))


In [None]:
# find outliers - box plot & skewed data - hist plot and violin plot

def skewness_plot(df, *column):
    """Plot a box plot, histogram and violin plot for each given column.

    One grid row is drawn per column on a single figure. A column whose name
    contains 'log', 'sqrt' or 'boxcox' is titled "After Transformation",
    any other column "Before Transformation".

    Args:
        df: DataFrame holding the columns.
        *column: Names of the numeric columns to plot.

    Returns:
        The result of ``plt.show()``.
    """
    number_row = len(column)
    plot_no = 0

    # Create the figure once; opening a new one per column would scatter the
    # rows of the grid over separate, mostly empty figures.
    plt.figure(figsize=(18,18))

    for col_name in column:
        if 'log' in col_name or 'sqrt' in col_name or 'boxcox' in col_name:
            title = "After Transformation"
        else:
            title = "Before Transformation"

        plot_no += 1
        plt.subplot(number_row, 3, plot_no)
        sns.boxplot(x=col_name, data=df)
        plt.title('Boxplot - '+ title)

        plot_no += 1
        plt.subplot(number_row, 3, plot_no)
        sns.histplot(df[col_name], bins=30, edgecolor='black')
        plt.title(f'Histogram - Skewness: {df[col_name].skew():.2f}')

        plot_no += 1
        plt.subplot(number_row, 3, plot_no)
        sns.violinplot(x=col_name, data=df)
        plt.title('Violinplot -'+ title)

    plt.tight_layout()
    return plt.show()

In [None]:
# Specify numerical columns for analysis
numerical_columns = ['quantity tons','width','thickness','selling_price']
# calling the Skewness_plot function
skewness_plot(df, *numerical_columns)


In [None]:
df1 = df.copy()

In [None]:
# quantity tons, thickness and selling price data are skewd. so using the log transformation method to handle the skewness data

def Log_Transformation(df, *column):
    """Add ``<name>_log`` columns (``log1p`` of each input column) and plot them.

    ``df`` is modified in place. After the new columns are added, every column
    of ``df`` whose name contains 'log' is passed to ``skewness_plot``.

    Args:
        df: DataFrame to extend.
        *column: Names of the numeric columns to transform.

    Returns:
        Whatever ``skewness_plot`` returns.
    """
    for source_name in column:
        df[source_name + '_log'] = np.log1p(df[source_name])

    log_named = list(filter(lambda name: 'log' in name, df.columns))

    return skewness_plot(df, *log_named)

In [None]:
Log_Transformation(df1, *numerical_columns)

In [None]:
column_name = ['quantity tons','thickness','width','selling_price','quantity tons_log','thickness_log','selling_price_log']
sns.heatmap(df1[column_name].corr(), annot=True)

# Outliers Handling - Interquartile Range (IQR) method

In [None]:
def outlier_plot(df):
    """Draw box plots of the four outlier-handled columns on a 2x2 grid.

    Args:
        df: DataFrame containing 'width', 'quantity tons_log',
            'thickness_log' and 'selling_price_log'.

    Returns:
        The result of ``plt.show()``.
    """
    boxed_columns = ('width', 'quantity tons_log', 'thickness_log', 'selling_price_log')

    plt.figure(figsize=(16,10))

    for slot, name in enumerate(boxed_columns, start=1):
        plt.subplot(2, 2, slot)
        sns.boxplot(x=name, data=df)
        plt.title('BoxPlot - ' + name)

    plt.tight_layout()

    return plt.show()

In [None]:
outlier_plot(df1)

In [None]:
df1

In [None]:
# Using IQR and clip() methods to handel the outliers and add a new column of DataFrame 

def outlier(df, column):
    """Clip ``df[column]`` in place to the range [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Name of the numeric column to clip.

    Returns:
        None.
    """
    # Interquartile range: distance between the 75th and 25th percentiles.
    iqr = df[column].quantile(0.75) - df[column].quantile(0.25)
    upper_threshold = df[column].quantile(0.75) + (1.5*iqr)
    lower_threshold = df[column].quantile(0.25) - (1.5*iqr)
    # Values outside the thresholds are replaced by the nearest threshold.
    df[column] = df[column].clip(lower_threshold, upper_threshold)

In [None]:
# (Ex: lower threshold = 5 and upeer threshold = 20)
# above upeer threshold values (>20) are converted to upeer threshold value (20) in features
# below lower threshold values (<5) are converted to lower threshold value (5) in features

outlier(df1, 'quantity tons_log')
outlier(df1, 'thickness_log')
outlier(df1, 'selling_price_log')
outlier(df1, 'width')

In [None]:
outlier_plot(df1)

In [None]:
outlier_handle_col = ['quantity tons_log', 'width', 'thickness_log', 'selling_price_log']
skewness_plot(df1, *outlier_handle_col)


In [None]:
df1.head()

In [None]:
column_name = ['quantity tons','thickness', 'width', 'selling_price', 'quantity tons_log', 'thickness_log', 'selling_price_log']
sns.heatmap(df1[column_name].corr(), annot=True )

In [None]:
df1.drop(columns=['width_log'], inplace=True)
df1

In [None]:
df1.dtypes

In [None]:
df2 = df1.copy()

# Find the difference between item and delivery date and add the new column in dataframe.
# The raw columns hold YYYYMMDD values. pd.to_datetime without a format reads
# numbers as nanoseconds since the epoch, so the values are rendered as text
# (dropping a trailing ".0") and parsed with an explicit format instead.
item_date_parsed = pd.to_datetime(
    df2['item_date'].astype(str).str.replace(r'\.0$', '', regex=True),
    format='%Y%m%d', errors='coerce')
delivery_date_parsed = pd.to_datetime(
    df2['delivery date'].astype(str).str.replace(r'\.0$', '', regex=True),
    format='%Y%m%d', errors='coerce')
df2['day_difference'] = (delivery_date_parsed - item_date_parsed).dt.days

In [None]:
# The values are YYYYMMDD (four-digit year), so the format is %Y%m%d, not %y%m%d.
# A trailing ".0" from float storage is stripped first; invalid dates become NaT.
df2['item_date'] = pd.to_datetime(
    df2['item_date'].astype(str).str.replace(r'\.0$', '', regex=True),
    format='%Y%m%d', errors='coerce')
df2['delivery date'] = pd.to_datetime(
    df2['delivery date'].astype(str).str.replace(r'\.0$', '', regex=True),
    format='%Y%m%d', errors='coerce')

df2['item_date_day'] = df2['item_date'].dt.day
df2['item_date_month'] = df2['item_date'].dt.month
df2['item_date_year'] = df2['item_date'].dt.year
df2

In [None]:
# split the non-negative value of 'Date_difference' column in separate dataFrame

non_negative_delivery_date_df = df2[df2['day_difference'] > 0]

# split negative value od 'Date_difference' column in another dataframe

negative_delivery_date_df = df2[df2['day_difference'] <= 0]

non_negative_delivery_date_df.shape, negative_delivery_date_df.shape

In [None]:
# make a copy of that

correct_delivery_date = non_negative_delivery_date_df.copy()
inconsistance_delivery_date = negative_delivery_date_df.copy()


In [None]:
from sklearn.ensemble import (AdaBoostRegressor,
                              RandomForestRegressor,
                              ExtraTreesRegressor,
                              GradientBoostingRegressor,
                              HistGradientBoostingRegressor
                              )
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
class Regression:
    """Fit a fixed set of regressors on one train/test split and tabulate their scores."""

    def __init__(self, x, y):
        """Create the candidate models and split ``x``/``y`` 80/20 with seed 42."""
        self.models = {
            'AdaboostRegressor': AdaBoostRegressor(),
            'RandomForestRegressor': RandomForestRegressor(),
            'ExtraTreesRegressor': ExtraTreesRegressor(),
            'GradientBoostingRegressor': GradientBoostingRegressor(),
            'HistGradientBoostingRegressor': HistGradientBoostingRegressor(),
            'DecisionTreeRegressor' : DecisionTreeRegressor(),
            'XGBRegressor': XGBRegressor()
        }
        split = train_test_split(x, y, test_size=0.2, random_state=42)
        self.x_train, self.x_test, self.y_train, self.y_test = split

    def evaluate_model(self, model_name, model):
        """Fit ``model`` on the training split and return its train/test metrics.

        Returns:
            dict with the model name plus MSE, MAE and R2 for both splits. The
            MSE values and the train R2 are formatted strings; the MAE values
            and the test R2 are rounded floats.
        """
        fitted = model.fit(self.x_train, self.y_train)
        train_pred = fitted.predict(self.x_train)
        test_pred = model.predict(self.x_test)

        train_mse = mean_squared_error(self.y_train, train_pred)
        test_mse = mean_squared_error(self.y_test, test_pred)
        train_mae = mean_absolute_error(self.y_train, train_pred)
        test_mae = mean_absolute_error(self.y_test, test_pred)
        train_r2 = r2_score(self.y_train, train_pred)
        test_r2 = r2_score(self.y_test, test_pred)

        return {
            'Model_Name': model_name,
            'Train_Mean_Square_Error': f'{train_mse: .4e}',
            'Test_Mean_Square_Error': f'{test_mse: .4e}',
            'Train_Mean_Absolute_Error': round(train_mae, 4),
            'Test_Mean_Absolute_Error': round(test_mae, 4),
            'Train_R2_Score': f'{train_r2: .4e}',
            'Test_R2_Score': round(test_r2, 4),
        }

    def all_models(self):
        """Evaluate every model, store the table on ``model_score_df`` and return it."""
        rows = []
        for label, estimator in self.models.items():
            rows.append(self.evaluate_model(label, estimator))

        self.model_score_df = pd.DataFrame(rows)

        return self.model_score_df


In [None]:
# categorical to numerical conversion
# The encoder is fitted once per column on the values of BOTH frames, so a given
# category receives the same code in the training data and in the rows to predict.
# Fitting it separately on each frame would give two unrelated code tables.

enc = OrdinalEncoder()
for col in ['delivery date','item_date', 'status','item type']:
    enc.fit(pd.concat([correct_delivery_date[[col]], inconsistance_delivery_date[[col]]],
                      ignore_index=True))
    correct_delivery_date[col] = enc.transform(correct_delivery_date[[col]]).ravel()
    inconsistance_delivery_date[col] = enc.transform(inconsistance_delivery_date[[col]]).ravel()

In [None]:
# split to asign the independend and depentend features in correct_delivery_date

y = correct_delivery_date['day_difference']
x = correct_delivery_date.drop(['day_difference','item_date', 'delivery date'], axis= 1)

In [None]:
# Train models using the 'regression_method

models = Regression(x, y)
results_df = models.all_models()
results_df


In [None]:
def train_and_evalute(x, y, inconsistent_data):
    """Train a random forest on ``x``/``y`` and predict for ``inconsistent_data``.

    The data is split 80/20 (seed 42), a ``RandomForestRegressor`` (seed 42) is
    fitted on the training part, and the train and test mean squared errors are
    printed. The model is then applied to ``inconsistent_data`` after removing
    its 'day_difference', 'item_date' and 'delivery date' columns.

    Returns:
        The predicted day differences for ``inconsistent_data``.
    """
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    forest = RandomForestRegressor(random_state=42)
    forest.fit(x_train, y_train)

    # Mean squared error on the rows the model saw and on the held-out rows.
    mse_train = mean_squared_error(y_train, forest.predict(x_train))
    mse_test = mean_squared_error(y_test, forest.predict(x_test))

    print(f'Train_Mean_Square_Error:{round(mse_train, 4)}')
    print(f'Test_Mean_Square_Error: {round(mse_test, 4)}')

    # Features of the rows whose delivery date is inconsistent.
    features_to_fix = inconsistent_data.drop(['day_difference','item_date', 'delivery date'],axis = 1)

    return forest.predict(features_to_fix)


In [None]:
day_pred = train_and_evalute(x, y, inconsistance_delivery_date)

In [None]:
# Update 'day_difference' in the original Dataframe

negative_delivery_date_df['day_difference'] = day_pred

In [None]:
negative_delivery_date_df.head()

In [None]:
# update "delivery_date" with the help of 'day_defference' in the original DataFrame
negative_delivery_date_df['delivery date'] = negative_delivery_date_df['item_date'] + pd.to_timedelta(negative_delivery_date_df['day_difference'], unit='d')

negative_delivery_date_df.head()

In [None]:
# Take  a copy for the purpose of saving the data
sample = negative_delivery_date_df.copy()

In [None]:
negative_delivery_date_df['item_date'] = pd.to_datetime(negative_delivery_date_df['item_date'])

# Update the 'delivery date' using apply and a lambda function
negative_delivery_date_df['delivery date'] = negative_delivery_date_df.apply(
    lambda row: row['item_date'] + pd.Timedelta(days=row['day_difference']), axis=1
)
negative_delivery_date_df.head()

In [None]:
final_copper_data = pd.concat([non_negative_delivery_date_df, negative_delivery_date_df],axis=0, ignore_index=True)

# Extract day, mont, and year components from 'delivery date' 
final_copper_data['delivery date'] = pd.to_datetime(final_copper_data['delivery date'])
final_copper_data['delivery_date_day'] = final_copper_data['delivery date'].dt.day
final_copper_data['delivery_date_month'] = final_copper_data['delivery date'].dt.month
final_copper_data['delivery_date_year'] = final_copper_data['delivery date'].dt.year
final_copper_data.head()

In [None]:
# Drop 'item_date' and 'delivery date' columns 
final_copper_data.drop(['item_date','delivery date'], axis=1, inplace=True)
final_copper_data


In [None]:
final_copper_data.to_csv("final_copper_data.csv", index=False)

# Classification method - Predict Status  

In [None]:
df = pd.read_csv(r"C:\Users\viren\OneDrive\Desktop\IIT-MADARAS(GUVI)\Industrial Copper Modeling\Industrial-Copper-Modeling\final_copper_data.csv")
final_copper_data =pd.DataFrame(df)
final_copper_data

In [None]:
# Filter rows where 'Status' is either 'Won' or 'Lost'
final_data = final_copper_data[(final_copper_data['status'] == 'Won') | (final_copper_data['status'] == 'Lost')]
final_data

In [None]:
print(final_data['status'].unique())
print(final_data['item type'].unique())

In [None]:
final_data.status.value_counts()

In [None]:
from sklearn.preprocessing import OrdinalEncoder
columns_to_encode = ['status','item type']

for col_name in columns_to_encode:
    encoder = OrdinalEncoder()
    final_data[col_name] = encoder.fit_transform(final_data[[col_name]])

In [None]:
# unique() must be called; without the parentheses the bound method object is printed.
print(final_data['status'].unique())
print(final_data['item type'].unique())

In [None]:
final_data

In [None]:
import pandas as pd

# Check the distribution of the target variable
class_distribution = final_data['status'].value_counts()

# print distribution
print("Class Distribution:")
print(class_distribution)

# Check if the classes are balanced or imbalanced
if len(class_distribution) == 2:
    # value_counts() sorts by count in descending order, so the classes are
    # picked by count rather than by position to avoid swapping them.
    majority_class = class_distribution.idxmax()
    minority_class = class_distribution.idxmin()
    majority_samples = class_distribution.max()
    minority_samples = class_distribution.min()

    imbalance_ratio = majority_samples / minority_samples
    print("\nImbalance Ratio:",imbalance_ratio)

    if imbalance_ratio > 1.5: # Adjust this threshold based on our problem
        print("The classes are imbalanced.")
    else:
        print("The classes are balanced.")
else:
    print("Not a binary classification problem.")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

In [None]:
final_data.info()

In [None]:
# Define target variable 'y_new' and feature 'x_new'
y_new = final_data['status']
x_new = final_data.drop('status',axis=1)

In [None]:
def machine_learning_classification(x_new, y_new, algorithm, test_size=0.2, random_state=42):
    """Train one classifier on a hold-out split and report its accuracy.

    Parameters
    ----------
    x_new : feature matrix passed to ``train_test_split`` and the estimator.
    y_new : target labels aligned with ``x_new``.
    algorithm : estimator *class* (not an instance); it is instantiated with
        its default arguments.
    test_size : fraction of the rows held out for testing (default 0.2).
    random_state : seed used for the split and, when the estimator exposes a
        ``random_state`` parameter, for the estimator as well (default 42).

    Returns
    -------
    dict with the keys ``'algorithm'`` (class name), ``'accuracy_train'`` and
    ``'accuracy_test'``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x_new, y_new, test_size=test_size, random_state=random_state
    )

    estimator = algorithm()
    # Seed the estimator too: randomised learners otherwise produce different
    # scores on every run, which makes the model comparison unrepeatable.
    if hasattr(estimator, 'get_params') and 'random_state' in estimator.get_params():
        estimator.set_params(random_state=random_state)
    model = estimator.fit(x_train, y_train)

    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    accuracy_train = metrics.accuracy_score(y_train, y_pred_train)
    accuracy_test = metrics.accuracy_score(y_test, y_pred_test)

    return {
        'algorithm': algorithm.__name__,
        'accuracy_train': accuracy_train,
        'accuracy_test' : accuracy_test
    }

# Candidate estimator classes to compare (each is instantiated inside
# machine_learning_classification).
classifiers = [
    DecisionTreeClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    XGBClassifier,
]

# One metrics dict per candidate, in the same order as `classifiers`.
results = [
    machine_learning_classification(x_new, y_new, candidate)
    for candidate in classifiers
]

# Tabulate the metrics: one row per algorithm.
results_df = pd.DataFrame(results)
results_df

In [None]:
# Difference between accuracy_train and accuracy_test for each algorithm
# (row order matches results_df).
results_df['accuracy_train']-results_df['accuracy_test']

In [None]:
# Choosing ExtraTreesClassifier: it showed high accuracy on both the training
# set (1.0) and the test set (0.971) in the comparison above.
# Fit an ExtraTreesClassifier model to the training set
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

# Seed the model as well as the split; an unseeded ExtraTreesClassifier gives
# a slightly different score on every run.
model = ExtraTreesClassifier(random_state=42).fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Shapes of the four split parts, in the order x_train, x_test, y_train, y_test.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# FROM THIS POINT ON, THE CODE IS NOT USED IN THE PROJECT
# df["item_date_1"] = pd.to_datetime(df["item_date"].astype(str).str.split('.').str[0],format='%y%m%d',errors='coerce')
# df["delivery_date_1"] = pd.to_datetime(df["delivery date"].astype(str).str.split('.').str[0],format='%y%m%d',errors='coerce')
# df["quantity tons"] = pd.to_numeric(df["quantity tons"],errors='coerce')

In [None]:
# df["item_date_1"] = pd.to_datetime(df["item_date"], unit="s", errors="coerce")
# df["delivery date_1"] = pd.to_datetime(df["delivery date"], unit="s", errors="coerce")
# df["quantity tons"] = pd.to_numeric(df["delivery date"], errors="coerce")

In [None]:
# Print column dtypes and non-null counts of df.
df.info()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Replace "material_ref" values that start with "00000" by NaN.
# The column is dropped from df earlier in the notebook, so only clean it when
# it is still present; otherwise this cell raises KeyError on a top-to-bottom run.
if "material_ref" in df.columns:
    df["material_ref"] = df["material_ref"].apply(lambda x: np.nan if str(x).startswith("00000") else x)

In [None]:
# Missing-value count per column.
df.isna().sum()


In [None]:
# "material_ref" has the largest number of null values, so the column is dropped.
# "id" is a unique identifier, so it is dropped as well.
# errors="ignore": both columns are already removed earlier in the notebook;
# without it a second drop raises KeyError.
df.drop(columns=["id","material_ref"],inplace=True,errors="ignore")

In [None]:
# Missing-value count per column.
df.isna().sum()


In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# Preview the parsed date columns and count their missing values.
# The delivery column is created as 'delivery date_1' (with a space);
# 'delivery_date_1' does not exist in df and raised KeyError.
date_columns = ['item_date_1', 'delivery date_1']
print(df[date_columns].head())
print(df[date_columns].isna().sum())

In [None]:
# Selling prices that are zero or negative are replaced by NaN;
# every other value (including existing NaN) is kept unchanged.
non_positive_price = df["selling_price"] <= 0
df["selling_price"] = df["selling_price"].mask(non_positive_price)

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Print column dtypes and non-null counts of df.
df.info()

In [None]:
# Missing-value count per column.
df.isna().sum()

# Handling the null values using mean(), median() and mode()

In [None]:
# Date / categorical columns: fill missing values with the column's most
# frequent value (mode).
# - Every column is filled from its OWN mode. 'item_date' was previously filled
#   with the mode of 'item_date_1', a different column.
# - The result is assigned back to df. Calling fillna(inplace=True) on a
#   selected column is chained assignment and leaves df unchanged when pandas
#   Copy-on-Write is active.
mode_fill_columns = ["item_date_1", "delivery date_1", "status", "item_date", "delivery date"]

for column in mode_fill_columns:
    column_modes = df[column].mode()
    # mode() is empty when the column holds no non-null value; nothing to fill with.
    if not column_modes.empty:
        df[column] = df[column].fillna(column_modes.iloc[0])

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Numerical columns: fill missing values with the column median.
# The result is assigned back to df. Calling fillna(inplace=True) on a selected
# column is chained assignment and leaves df unchanged when pandas
# Copy-on-Write is active.
median_fill_columns = ["quantity tons", "customer", "country", "application", "thickness", "selling_price"]

for column in median_fill_columns:
    df[column] = df[column].fillna(df[column].median())

In [None]:
# Missing-value count per column.
df.isna().sum()


# Encoding categorical columns


In [None]:
# Preview the first rows of df before encoding.
df.head()

In [None]:
# Distinct status values before they are mapped to integer codes.
df["status"].unique()

In [None]:
# Integer code assigned to each status label.
# Series.map turns any label that is not a key of this table into NaN.
states = {
    'Won': 1,
    'Draft': 2,
    'To be approved': 3,
    'Lost': 0,
    'Not lost for AM': 4,
    'Wonderful': 5,
    'Revised': 6,
    'Offered': 7,
    'Offerable': 8,
}

status_codes = df["status"].map(states)
df["status"] = status_codes


In [None]:
# Distinct status values after the mapping.
df["status"].unique()

In [None]:
# Distinct item type values before they are mapped to integer codes.
df["item type"].unique()

In [None]:
# Item type labels listed in code order: the position of a label is its code
# ('W' -> 0 ... 'SLAWR' -> 6).
item_type_labels = ['W', 'WI', 'S', 'Others', 'PL', 'IPL', 'SLAWR']
item_t = {label: code for code, label in enumerate(item_type_labels)}

# Series.map turns any label that is not a key of item_t into NaN.
df["item type"] = df["item type"].map(item_t)

In [None]:
# Distinct item type values after the mapping.
df["item type"].unique()

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# Number of missing values left in 'item_date_1'.
df["item_date_1"].isna().sum()

In [None]:
# Standard deviation of the 'delivery date_1' column.
df["delivery date_1"].std()

In [None]:
# Print the dtype of each parsed date column, one per line.
for date_column in ('item_date_1', 'delivery date_1'):
    print(df[date_column].dtype)

In [None]:
# Print the number of distinct values in each parsed date column, one per line.
for date_column in ('item_date_1', 'delivery date_1'):
    print(df[date_column].nunique())