# **Association**

### Scatter plot

In [None]:


import matplotlib.pyplot as plt

import numpy as np

# Draw 500 independent (x, y) pairs with np.random.randint:
# x takes integer values in 1..1200 and y in 1..200.
x = np.random.randint(low=1, high=1201, size=500)
y = np.random.randint(low=1, high=201, size=500)

# Scatter the two independently drawn variables against each other.
plt.scatter(x, y)
plt.title("Scatter Plot of X and Y")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.show()


### Linear negative association

In [None]:


import matplotlib.pyplot as plt

import numpy as np

# Draw 50 integer x values in 1..1200 and set y = -2 * x, so every point
# lies exactly on a downward-sloping straight line.
x = np.random.randint(low=1, high=1201, size=50)
y = [-2 * xi for xi in x]

# Scatter y against x.
plt.scatter(x, y)
plt.title("Scatter Plot of X and Y")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.show()


### Linear positive association

In [None]:
# prompt: draw scatterplot for x and y

import matplotlib.pyplot as plt

import numpy as np

# Draw 50 integer x values in 1..1200 and set y = 3 * x, so every point
# lies exactly on an upward-sloping straight line.
x = np.random.randint(low=1, high=1201, size=50)
y = [3 * xi for xi in x]

# Scatter y against x.
plt.scatter(x, y)
plt.title("Scatter Plot of X and Y")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.show()


# Observe Correlation Using Heat Map

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Seed NumPy's global generator so the synthetic data is the same on every run
np.random.seed(42)

# Six columns of 50 values each.  Each line makes exactly one random draw and
# the lines must stay in this order: the order decides which random numbers
# each column receives.
X1 = np.random.randint(1, 100, size=50)            # integers in 1..99
X2 = np.random.normal(0, 5, size=50) + X1          # X1 plus sd-5 noise: strong positive correlation with X1
X3 = np.random.randint(1, 100, size=50)            # second, independent integer column
X4 = np.random.normal(0, 5, size=50) - X3          # negated X3 plus sd-5 noise: strong negative correlation with X3
X5 = 0.2 * X1 + np.random.normal(0, 10, size=50)   # scaled-down X1 plus sd-10 noise: weaker positive correlation
X6 = np.random.randint(1, 100, size=50)            # drawn independently of all the others

# Collect the columns into one DataFrame
df = pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, X6=X6))

# Show the first five rows
print(df.head(5))


In [None]:
# Pairwise correlation between every pair of columns in df
# (DataFrame.corr uses the Pearson coefficient by default), then print the table.
correlation_matrix = df.corr()
print(correlation_matrix)


In [None]:
# Draw the correlation matrix as a heatmap: each cell is labelled with its
# value to two decimals and coloured with the "coolwarm" palette.
plt.figure(figsize=(8, 6))
sns.heatmap(
    data=correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
)
plt.title("Correlation Heatmap")
plt.show()


**Interpretation**
- **Correlation Heatmap**:  
  - Shows how strongly each pair of variables is correlated.  
  - Dark red = strong positive correlation.  
  - Dark blue = strong negative correlation.  
  - Near zero = little or no linear correlation.  







# Linear Regression

## Case Study: Air passenger prediction
### Correlation among variables

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load the AirPassengers case-study CSV from GitHub.
# This rebinds `df`, replacing the synthetic frame built earlier in the notebook.
df = pd.read_csv('https://raw.githubusercontent.com/venkatareddykonasani/Datasets/refs/heads/master/AirPassengers/AirPassengers.csv')

In [None]:
# List the column names of the loaded dataset
df.columns

In [None]:
# prompt: plot heat plot using seaborn for only numerical columns

import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the numeric columns of the dataset
df_numerical = df.select_dtypes(include=['number'])
numerical_cols = df_numerical.columns

# Pairwise correlations between the numeric columns
correlation_matrix = df_numerical.corr()

# Annotated heatmap of that correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
)
plt.title('Correlation Matrix Heatmap (Numerical Columns)')
plt.show()


### Correlation between each feature and target variable

In [None]:
# For every numeric column, print the 2x2 correlation matrix between
# Passengers and that column (the off-diagonal entries are the coefficient).
numeric_columns = df.select_dtypes(include=['number']).columns
for col in numeric_columns:
    print(f"Correlation between  {col}  and Passengers")
    print("                                       ")
    print(np.corrcoef(df.Passengers, df[col]))
    print("---------------------------------")

### Model - 1: Promotion Budget vs Passengers

In [None]:

# Simple linear regression: predict Passengers from Promotion_Budget alone.

# Target variable
y = df["Passengers"]

# Feature, selected with double brackets so it stays a 2-D frame for scikit-learn
X1 = df[["Promotion_Budget"]]

# Fit on the full dataset and predict the same rows (in-sample fit)
model_high = LinearRegression().fit(X1, y)
y_pred = model_high.predict(X1)

# In-sample R² of the fit
r2_high = r2_score(y, y_pred)

print(f"R² Score ( Promotion Budget vs Passengers): {r2_high:.4f}")

# Actual observations with the fitted line drawn on top
sns.scatterplot(x=df['Promotion_Budget'], y=y, label="Actual")
sns.lineplot(x=df['Promotion_Budget'], y=y_pred, color="red", label="Predicted")
# Title the current axes through pyplot: no `axes` array is created in this
# cell, so the previous `axes[0].set_title(...)` raised NameError.
plt.title(f" R²:  Promotion Budget vs Passengers ({r2_high:.4f})")

plt.tight_layout()
plt.show()


### Model - 2: Inter Metro Flight Ratio vs Passengers

In [None]:

# Simple linear regression: predict Passengers from Inter_metro_flight_ratio alone.

# Target variable (defined here so the cell does not rely on an earlier cell's `y`)
y = df["Passengers"]

# Feature, selected with double brackets so it stays a 2-D frame for scikit-learn
X1 = df[["Inter_metro_flight_ratio"]]

# Fit on the full dataset and predict the same rows (in-sample fit)
model_high = LinearRegression().fit(X1, y)
y_pred = model_high.predict(X1)

# In-sample R² of the fit
r2_high = r2_score(y, y_pred)

print(f"R² Score (Inter_metro_flight_ratio vs Passengers ): {r2_high:.4f}")

# Actual observations with the fitted line drawn on top
sns.scatterplot(x=df['Inter_metro_flight_ratio'], y=y, label="Actual")
sns.lineplot(x=df['Inter_metro_flight_ratio'], y=y_pred, color="red", label="Predicted")
# Title the current axes through pyplot: the plt.subplots call that would
# have created `axes` is not executed, so `axes[0].set_title(...)` raised NameError.
plt.title(f"R²: Inter_metro_flight_ratio vs Passengers ({r2_high:.4f})")

plt.tight_layout()
plt.show()


### Model - 3: Service Quality Score vs Passengers

In [None]:

# Simple linear regression: predict Passengers from Service_Quality_Score alone.

# Target variable
y = df["Passengers"]

# Feature, selected with double brackets so it stays a 2-D frame for scikit-learn
X1 = df[["Service_Quality_Score"]]

# Fit on the full dataset and predict the same rows (in-sample fit)
model_high = LinearRegression().fit(X1, y)
y_pred = model_high.predict(X1)

# In-sample R² of the fit
r2_high = r2_score(y, y_pred)

print(f"R² Score (Service_Quality_Score vs Passengers): {r2_high:.4f}")

# Actual observations with the fitted line drawn on top
sns.scatterplot(x=df['Service_Quality_Score'], y=y, label="Actual")
sns.lineplot(x=df['Service_Quality_Score'], y=y_pred, color="red", label="Predicted")
# Title the current axes through pyplot: the plt.subplots call that would
# have created `axes` is not executed, so `axes[0].set_title(...)` raised NameError.
plt.title(f"R²: Service_Quality_Score vs Passengers({r2_high:.4f})")

plt.tight_layout()
plt.show()


### **Model for Predicting Number of Passengers**

In [None]:
import statsmodels.formula.api as sm
# Fit an OLS model of Passengers on Promotion_Budget alone and print the
# full regression summary table.
model = sm.ols(formula='Passengers ~ Promotion_Budget', data=df)
fitted1 = model.fit()
print(fitted1.summary())

In [None]:
# y = b0 +b1*x
# Passengers = 1259.6058 + 0.0695 * (promo_budget)
# promo_budget = 500,000 ==> Predict passengers ?
# NOTE(review): the intercept and slope below are hard-coded, presumably copied
# from the OLS summary printed by the previous cell — confirm they still match
# whenever the model is re-fitted.

1259.6058 + 0.0695 * (500000)

# **Multiple Linear Regression**

In [None]:

# Multiple linear regression: predict Passengers from all other numeric columns.

# Target variable
y = df["Passengers"]

# Features: every numeric column except the target itself.
# `df_numerical` is the numeric-only frame built in the heatmap cell above.
X1 = df_numerical.drop(['Passengers'],axis=1)

# Fit on the full dataset and predict the same rows (in-sample fit)
model_high = LinearRegression().fit(X1, y)
y_pred = model_high.predict(X1)

# In-sample R² of the fit
r2_high = r2_score(y, y_pred)

# The previous label ("LSTAT vs MEDV") came from a different dataset and
# mislabelled this result.
print(f"R² Score (Multiple Linear Regression - all numeric features vs Passengers): {r2_high:.4f}")

In [None]:
# Fit Passengers on all three predictors together and print the regression summary
model = sm.ols(formula='Passengers ~ Promotion_Budget+Service_Quality_Score+Inter_metro_flight_ratio', data=df)
fitted = model.fit()
print(fitted.summary())

### Impact of Individual variables

In [None]:
# Are there any predictor variables that are not impacting the dependent variable?
# Inter_metro_flight_ratio is dropped here; the model is refitted on the two
# remaining predictors and its summary printed.
import statsmodels.formula.api as sm
model = sm.ols(formula='Passengers ~ Promotion_Budget+Service_Quality_Score', data=df)
fitted = model.fit()
print(fitted.summary())

### **Adjusted R square**

In [None]:
# Load the sample dataset used for the adjusted R-squared comparison
adj_sample=pd.read_csv("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Adjusted%20RSquare/Adj_Sample.csv")
# Build a model to predict Y using x1, x2 and x3. Note down the R-squared and
# adjusted R-squared values from the printed summary.
model = sm.ols(formula='Y ~ x1+x2+x3', data=adj_sample)
fitted = model.fit()
print(fitted.summary())
# R-squared

In [None]:
# Model 2: the same target with x4, x5 and x6 added to the first three predictors
model = sm.ols(formula='Y ~ x1+x2+x3+x4+x5+x6', data=adj_sample)
fitted = model.fit()
print(fitted.summary())

In [None]:
# Model 3: the same target with x7 and x8 added as well (eight predictors in total)
model = sm.ols(formula='Y ~ x1+x2+x3+x4+x5+x6+x7+x8', data=adj_sample)
fitted = model.fit()
print(fitted.summary())

### Multiple Regression - Issues

In [None]:
# Import the Regional Sales data from GitHub
regional_sales=pd.read_csv("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Regional_Sales/Sales_by_Region.csv")

# Variable names (the cell's last expression, so the notebook echoes it)
regional_sales.columns

In [None]:
# Fit Regional_Sales on all four candidate predictors and print the summary
model1 = sm.ols(formula='Regional_Sales ~ Avg_Income+Avg_Expenses+Percent_Male+Percent_Female', data=regional_sales)
fitted1 = model1.fit()
print(fitted1.summary())

Model After dropping Avg Income

In [None]:
# Refit without Avg_Income and print the summary for comparison with the full model
model1 = sm.ols(formula='Regional_Sales ~ Avg_Expenses+Percent_Male+Percent_Female', data=regional_sales)
fitted1 = model1.fit()
print(fitted1.summary())

### VIF

In [None]:
#Code for VIF Calculation
#Writing a function to calculate the VIF values

def vif_cal(input_data):
    """Print the variance inflation factor (VIF) of every column.

    Each column in turn is regressed on all the remaining columns with the
    statsmodels formula API (bound to ``sm`` in this notebook), and
    ``VIF = 1 / (1 - R²)`` of that auxiliary regression is printed, rounded
    to two decimals.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame whose columns are compared with each other.  The callers in
        this notebook drop the target (and identifier) columns before
        passing the frame in.

    Returns
    -------
    None
        One line per column is printed; nothing is returned.
    """
    x_vars = input_data
    xvar_names = x_vars.columns
    for name in xvar_names:
        # Hand the response and the regressors to the formula under the fixed
        # keys "y" and "x".  Passing the whole frame as `data` meant a real
        # column called "x" or "y" would be picked up instead of these and
        # silently change the regression.
        aux_data = {"y": x_vars[name], "x": x_vars[xvar_names.drop(name)]}
        rsq = sm.ols(formula="y~x", data=aux_data).fit().rsquared
        # A perfectly explained column has an unbounded VIF; say so
        # explicitly instead of relying on a division-by-zero warning.
        vif = float("inf") if rsq >= 1 else round(1 / (1 - rsq), 2)
        print(name, " VIF = ", vif)

In [None]:
# Calculate VIF values with vif_cal on the predictors only:
# the identifier (Region_id) and the target (Regional_Sales) are dropped first.
X_Data=regional_sales.drop(["Region_id","Regional_Sales"],axis=1)
vif_cal(input_data=X_Data)

In [None]:
# Recompute the VIFs with Avg_Income removed from the predictors
vif_cal(input_data=X_Data.drop(["Avg_Income"], axis=1))

In [None]:
# Recompute the VIFs with both Avg_Income and Percent_Female removed
vif_cal(input_data=X_Data.drop(["Avg_Income","Percent_Female"], axis=1))

In [None]:
# Refit using only the two predictors kept after the VIF checks and print the summary
model1 = sm.ols(formula='Regional_Sales ~ Avg_Expenses+Percent_Male', data=regional_sales)
fitted1 = model1.fit()
print(fitted1.summary())

# Multiple Regression model building steps

In [None]:
# Load the Webpage Product Sales data from GitHub
Webpage_Product_Sales=pd.read_csv("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Webpage_Product_Sales/Webpage_Product_Sales.csv")
# A notebook cell only echoes its last expression, so the bare `.shape` that
# stood here was silently discarded; print it explicitly instead.
print(Webpage_Product_Sales.shape)
# Column names (last expression, echoed by the notebook)
Webpage_Product_Sales.columns

In [None]:
# Fit Sales on all ten candidate predictors and print the regression summary
model1 = sm.ols(formula='Sales ~ Web_UI_Score+Server_Down_time_Sec+Holiday+Special_Discount+Clicks_From_Serach_Engine+Online_Ad_Paid_ref_links+Social_Network_Ref_links+Month+Weekday+DayofMonth', data=Webpage_Product_Sales)
fitted1 = model1.fit()
print(fitted1.summary())

In [None]:
# Refit without Clicks_From_Serach_Engine (nine predictors) and print the summary
model1 = sm.ols(formula='Sales ~ Web_UI_Score+Server_Down_time_Sec+Holiday+Special_Discount+Online_Ad_Paid_ref_links+Social_Network_Ref_links+Month+Weekday+DayofMonth', data=Webpage_Product_Sales)
fitted1 = model1.fit()
print(fitted1.summary())

In [None]:
# VIF among the remaining predictors only.  The target `Sales` is dropped
# together with Clicks_From_Serach_Engine: leaving the target among the
# regressors makes every predictor's VIF reflect its relationship with Sales.
# NOTE(review): if the frame also carries an identifier column, drop it here
# too — the column list is not visible in this notebook.
vif_cal(Webpage_Product_Sales.drop(["Clicks_From_Serach_Engine", "Sales"], axis=1))

In [None]:
# Fit Sales on the nine predictors that exclude Clicks_From_Serach_Engine and print the summary.
# NOTE(review): this formula is identical to the one fitted in the cell just
# before the VIF check, so this cell re-fits the same model.
model1 = sm.ols(formula='Sales ~ Web_UI_Score+Server_Down_time_Sec+Holiday+Special_Discount+Online_Ad_Paid_ref_links+Social_Network_Ref_links+Month+Weekday+DayofMonth', data=Webpage_Product_Sales)
fitted1 = model1.fit()
print(fitted1.summary())