# Data Preprocessing

In [2]:
import numpy as np
import pandas as pd

In [17]:
# Read the survey export into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="TATA Apr-25 CSI.csv")
df.head(n=5)

Unnamed: 0,SA Professional Approach,SA Friendliness,SA Product Know.,On The Promised Time,Completed First Time,SA Explain The Work,Cleaned Properly,Overall Evaluation,Satisfaction,Concern (Yes/No),NPS,ADVISOR NAME
0,9,9,9,9,9,9,9,9.0,Satisfied,No,9.0,Samith Wickramasinghe
1,9,9,9,9,9,9,9,9.0,Satisfied,No,,Lakmal Perera
2,9,9,9,9,9,9,9,9.0,Satisfied,No,,Chathuranga de silva
3,10,10,10,10,10,10,10,10.0,Extremely Satisfied,No,10.0,CHIRAN MEDAGEDARA
4,10,10,10,8,10,10,10,10.0,Satisfied,No,10.0,Deepal Kariyawasam


In [19]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

SA Professional Approach     0
SA Friendliness              0
SA Product Know.             0
On The Promised Time         0
Completed First Time         0
SA Explain The Work          0
Cleaned Properly             0
Overall Evaluation           1
Satisfaction                 0
Concern (Yes/No)             0
NPS                         32
ADVISOR NAME                 0
dtype: int64

In [None]:
# Drop the NPS column, which is missing in 32 rows according to the null count above.
# The column is selected by name rather than by position so this cell is idempotent:
# re-running `df.drop(df.columns[10], axis=1)` on the already-reduced frame would
# silently drop whichever column had moved into position 10 (ADVISOR NAME).
# Header whitespace is stripped for the comparison in case the CSV header is padded.
nps_columns = [col for col in df.columns if str(col).strip() == "NPS"]
df = df.drop(columns=nps_columns)

In [21]:
# Re-check missing values now that NPS has been removed.
df.isna().sum()

SA Professional Approach    0
SA Friendliness             0
SA Product Know.            0
On The Promised Time        0
Completed First Time        0
SA Explain The Work         0
Cleaned Properly            0
Overall Evaluation          1
Satisfaction                0
Concern (Yes/No)            0
ADVISOR NAME                0
dtype: int64

In [22]:
# Keep only the rows that have no missing value in any remaining column.
df_cleaned = df.dropna(axis=0, how="any")

In [23]:
# Confirm the cleaned frame has no missing values left.
df_cleaned.isna().sum()

SA Professional Approach    0
SA Friendliness             0
SA Product Know.            0
On The Promised Time        0
Completed First Time        0
SA Explain The Work         0
Cleaned Properly            0
Overall Evaluation          0
Satisfaction                0
Concern (Yes/No)            0
ADVISOR NAME                0
dtype: int64

In [24]:
# (rows, columns) of the cleaned frame.
df_cleaned.shape

(186, 11)

In [26]:
# Column dtypes and non-null counts of the cleaned frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 186 entries, 0 to 186
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   SA Professional Approach  186 non-null    int64  
 1   SA Friendliness           186 non-null    int64  
 2   SA Product Know.          186 non-null    int64  
 3   On The Promised Time      186 non-null    int64  
 4   Completed First Time      186 non-null    int64  
 5   SA Explain The Work       186 non-null    int64  
 6   Cleaned Properly          186 non-null    int64  
 7   Overall Evaluation        186 non-null    float64
 8   Satisfaction              186 non-null    object 
 9   Concern (Yes/No)          186 non-null    object 
 10  ADVISOR NAME              186 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 17.4+ KB


In [27]:
# Features: the first seven columns (positions 0-6) of the cleaned frame.
X = df_cleaned.iloc[:, :7]
# Target: the 'Overall Evaluation' score.
Y = df_cleaned.loc[:, 'Overall Evaluation']

In [28]:
# Verify the feature matrix holds the seven expected columns.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 186 entries, 0 to 186
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   SA Professional Approach  186 non-null    int64
 1   SA Friendliness           186 non-null    int64
 2   SA Product Know.          186 non-null    int64
 3   On The Promised Time      186 non-null    int64
 4   Completed First Time      186 non-null    int64
 5   SA Explain The Work       186 non-null    int64
 6   Cleaned Properly          186 non-null    int64
dtypes: int64(7)
memory usage: 11.6 KB


# Select Best Regression Model

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [30]:
def model_acc(model):
    """Fit ``model`` on the training split and print its test-split score.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The printed number is whatever ``model.score`` returns (R^2 for
    scikit-learn regressors). Returns nothing; ``model`` is left fitted.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(f"{model!s}:{test_score!s}")

In [53]:
# Compare candidate regressors on the same train/test split.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge

lr = LinearRegression()       # OLS
ridge = Ridge()               # L2 penalty, default alpha
lasso = Lasso()               # L1 penalty, default alpha
elastic = ElasticNet()        # mixed L1/L2 penalty, default settings
# Seed the forest: without random_state its bootstrap/feature sampling differs on
# every run, so the printed score could not be reproduced.
rf = RandomForestRegressor(random_state=42)

# model_acc fits each estimator in place and prints "<estimator>:<test score>".
# `lr` stays fitted and is reused by the Linear Regression section below.
for candidate in (lr, ridge, lasso, elastic, rf):
    model_acc(candidate)

LinearRegression():0.8130196284171692
Ridge():0.8155502436304114
Lasso():0.18443916077009725
ElasticNet():0.5692105811271213
RandomForestRegressor():0.7444344262778391


# Linear Regression

In [33]:
# Pearson correlation of every numeric column with the target.
(
    df_cleaned
    .select_dtypes(include='number')
    .corr()
    .loc[:, 'Overall Evaluation']
)

SA Professional Approach    0.651712
SA Friendliness             0.661731
SA Product Know.            0.624917
On The Promised Time        0.527731
Completed First Time        0.668639
SA Explain The Work         0.570160
Cleaned Properly            0.573385
Overall Evaluation          1.000000
Name: Overall Evaluation, dtype: float64

In [None]:
# Predict on the held-out rows. `lr` is fitted inside model_acc(lr) in the
# model-comparison cell, so that cell must have been run first.
y_pred = lr.predict(X_test)

In [35]:
from sklearn.metrics import mean_squared_error, r2_score

In [40]:
# Score the linear model's predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.14709813166073116
R-squared: 0.8130196284171692


In [50]:
# Label the coefficients with the columns the model was actually fitted on.
# `X` is rebound by the statsmodels cell further down (a 'const' column is added),
# so `X.columns` can be one entry longer than `lr.coef_` when cells are re-run out
# of order; `X_train` is not reassigned after the split.
feature_names = X_train.columns

# Fitted slopes of the LinearRegression model, one per feature.
coefficients = lr.coef_

# Pair each feature name with its coefficient for display.
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

print(coef_df)
print(f'Intercept (b): {lr.intercept_}')

                    Feature  Coefficient
0  SA Professional Approach    -0.168126
1           SA Friendliness     0.431936
2          SA Product Know.     0.145395
3      On The Promised Time    -0.011155
4      Completed First Time     0.266479
5      SA Explain The Work      0.190966
6         Cleaned Properly      0.165171
Intercept (b): -0.3271305683469503


In [46]:
import plotly.graph_objects as go

In [47]:
# Signed "importance" here is simply the fitted linear-regression coefficient.
# NOTE(review): coefficients are unscaled, so magnitudes are only comparable if
# all rating columns share the same scale — confirm.
feature_importance = lr.coef_

# Take the labels from the training frame the model was fitted on. `X` gains a
# 'const' column in the statsmodels cell, so `X.columns` can be one entry longer
# than `lr.coef_` if this cell is re-run after that one.
feature_names = X_train.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

# Rank by magnitude so the strongest effects (positive or negative) come first.
importance_df['Absolute Importance'] = importance_df['Importance'].abs()
importance_df = importance_df.sort_values(by='Absolute Importance', ascending=False)

# Horizontal bar chart; bar colour and label both show the signed coefficient.
fig = go.Figure()
fig.add_trace(go.Bar(
    y=importance_df['Feature'],
    x=importance_df['Importance'],
    orientation='h',
    text=importance_df['Importance'].apply(lambda x: f'{x:.2f}'),
    textposition='outside',
    marker=dict(color=importance_df['Importance'], colorscale='RdBu', showscale=True)
))

fig.update_layout(
    title='Feature Importance in Multiple Linear Regression with Coefficients (+/-)',
    xaxis_title='Importance (Coefficient Value)',
    yaxis_title='Feature',
    template='plotly_dark',
    showlegend=False
)

fig.show()

### Linear Regression Using the `statsmodels` Library

In [58]:
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so prepend a column of ones.
# NOTE: this rebinds `X`; from here on it carries an extra 'const' column, which
# earlier cells that read `X` do not expect if they are re-run afterwards.
X = sm.add_constant(X)

# Ordinary least squares of Y on the constant plus all features, using every
# cleaned row (no train/test split here).
model = sm.OLS(endog=Y, exog=X)
results = model.fit()

# Full regression table: coefficients, standard errors, p-values, fit statistics.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:     Overall Evaluation   R-squared:                       0.625
Model:                            OLS   Adj. R-squared:                  0.610
Method:                 Least Squares   F-statistic:                     42.34
Date:                Wed, 09 Jul 2025   Prob (F-statistic):           9.62e-35
Time:                        11:26:24   Log-Likelihood:                -223.44
No. Observations:                 186   AIC:                             462.9
Df Residuals:                     178   BIC:                             488.7
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   