# Why is logistic regression needed?

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Build a reproducible toy dataset: 100 customer ages and a 0/1 purchase flag
np.random.seed(0)
age = np.random.randint(low=18, high=65, size=100)  # integers from 18 up to, but not including, 65
bought = (age > 30).astype(int)  # deterministic label: 1 when age > 30, otherwise 0

# scikit-learn expects a 2-D feature matrix, so expose the ages as a single column
X = age[:, np.newaxis]

In [None]:


import pandas as pd

# Put the two arrays side by side in a DataFrame so they can be inspected together
df = pd.DataFrame(dict(age=age, bought=bought))
print(df.head())  # preview the first rows


In [None]:
# Print pandas' summary of the synthetic age/bought DataFrame built above
df.info()

In [None]:
# Fit ordinary least squares to the 0/1 purchase flag, with age as the only feature
model = LinearRegression().fit(X, bought)

# Score a handful of unseen ages; predict() needs them as a 2-D column
new_ages = np.array([[25], [35], [45], [55]])
predictions = model.predict(new_ages)

# Report each age next to the raw regression output
print("Predictions:")
for candidate_age, predicted in zip(new_ages[:, 0], predictions):
    print(f"Age: {candidate_age}, Predicted bought: {predicted}")

In [None]:
# prompt: plot the data and linear regression line

import matplotlib.pyplot as plt

# Draw the observations and the fitted line on a single set of axes
fig, ax = plt.subplots()
ax.scatter(age, bought, color='blue', label='Data Points')

# The line is drawn through the predictions made for new_ages
ax.plot(new_ages, predictions, color='red', label='Linear Regression Line')

# Annotate the figure so it can be read on its own
ax.set_xlabel("Age")
ax.set_ylabel("Bought")
ax.set_title("Linear Regression of Age vs. Bought")
ax.legend()

plt.show()


# Build linear regression

In [None]:


import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load the product-sales data; the columns used below are Age and Bought
url = "https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Product%20Sales%20Data/Product_sales.csv"
df = pd.read_csv(url)

# Prepare the data
X = df[["Age"]]  # Feature matrix: a one-column DataFrame (independent variable)
y = df["Bought"]  # Target variable (dependent variable)

# Create and train the linear regression model
model = LinearRegression()
model.fit(X, y)

# In-sample predictions, used only to draw the fitted line
y_pred = model.predict(X)

# Plot the data and the linear regression line.
# The axes show Age and Bought, so the labels and title name those columns
# (they previously read "Sales" / "Profit", which does not match the data plotted).
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X, y_pred, color='red', label='Linear Regression Line')
plt.xlabel("Age")
plt.ylabel("Bought")
plt.title("Linear Regression of Age vs. Bought")
plt.legend()
plt.show()


In [None]:
# Show the fitted line's parameters: intercept first, then the coefficient array
for label, value in (("intercept", model.intercept_), ("Co-efficient", model.coef_)):
    print(label, value)

In [None]:
# Make predictions on new data
new_ages = np.array([25, 35, 45, 55]).reshape(-1, 1)  # Reshape to 2D

# The model above was fitted on df[["Age"]], a DataFrame with a named column.
# Wrap the new ages in a DataFrame with the same column name; passing the bare
# ndarray makes scikit-learn warn that X does not have valid feature names.
new_ages_frame = pd.DataFrame(new_ages, columns=["Age"])
predictions = model.predict(new_ages_frame)

# Print each age next to the raw linear-regression output
print("Predictions:")
for candidate_age, predicted in zip(new_ages[:, 0], predictions):
    print(f"Age: {candidate_age}, Predicted bought: {predicted}")

# Build logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic model of Bought on Age using the product-sales frame
logistic = LogisticRegression()
logistic.fit(X=df[["Age"]], y=df["Bought"])

In [None]:
# Show the fitted logistic model's parameters: intercept first, then the coefficient array
for label, value in (("Intercept", logistic.intercept_), ("Coefficient", logistic.coef_)):
    print(label, value)

In [None]:
# Make predictions on new data
new_ages = np.array([25, 35, 45, 55]).reshape(-1, 1)  # Reshape to 2D

# The logistic model was fitted on df[["Age"]], a DataFrame with a named column.
# Wrap the new ages in a DataFrame with the same column name; passing the bare
# ndarray makes scikit-learn warn that X does not have valid feature names.
new_ages_frame = pd.DataFrame(new_ages, columns=["Age"])
predictions = logistic.predict(new_ages_frame)  # predicted class labels, not probabilities

# Print each age next to its predicted class
print("Predictions:")
for candidate_age, predicted in zip(new_ages[:, 0], predictions):
    print(f"Age: {candidate_age}, Predicted bought: {predicted}")

In [None]:
# Start from the single feature the logistic model was fitted on. Selecting "Age"
# explicitly (rather than "every column except Bought") keeps predict_proba's input
# aligned with the training features even if the CSV carries extra columns.
new_data = df[["Age"]].copy()

# Second column of predict_proba is the probability of the second entry in
# logistic.classes_ (presumably Bought == 1 -- confirm against the data).
new_data["pred_values"] = logistic.predict_proba(new_data[["Age"]])[:, 1]
# Keep the actual outcome next to the predicted probability
new_data["Actual"] = df["Bought"]

# Sort so the probability curve is drawn as one continuous line
new_data = new_data.sort_values(["pred_values"])
plt.scatter(new_data["Age"], new_data["Actual"])
plt.plot(new_data["Age"], new_data["pred_values"], color='red')
# Add labels and title
plt.title('Predicted vs Actual Plot')
plt.xlabel('Age')
plt.ylabel('Bought')
plt.show()

# Probability vs Logit Function

In [None]:
# prompt: plot between probability and its logit function. import all necessary packages

import numpy as np
import matplotlib.pyplot as plt

# 100 evenly spaced probabilities; the end points 0 and 1 are left out because
# the logit is not finite there
probabilities = np.linspace(0.01, 0.99, 100)

# logit(p) = log(p / (1 - p)), i.e. the log of the odds
odds = probabilities / (1 - probabilities)
logits = np.log(odds)

# Large, high-resolution canvas with probability on x and logit on y
fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
ax.plot(probabilities, logits)

# Annotate the figure so it can be read on its own
ax.set_xlabel("Probability")
ax.set_ylabel("Logit")
ax.set_title("Probability vs. Logit Function")
ax.grid(True)

plt.show()


# Logit vs Probability

In [None]:
# prompt: Generate logit data and then compute the probability. draw the figure

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Go the other way round: start from logit values and map them to probabilities
logits = np.linspace(-50, 50, 1000)  # 1000 evenly spaced logits from -50 to 50
probabilities = 1 / (1 + np.exp(-logits))  # sigmoid: inverse of the logit

# Large, high-resolution canvas with logit on x and probability on y
fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
ax.plot(logits, probabilities, label="Sigmoid function", color="blue")

# Annotate the figure so it can be read on its own
ax.set_xlabel("Logit")
ax.set_ylabel("Probability")
ax.set_title("Logit vs Probability")
ax.grid(True)
ax.legend()

# Overlay the individual sample points on the curve (unlabelled, so not in the legend)
ax.scatter(logits, probabilities, s=10, c="red")

plt.show()


# Multiple Logistic Regression

In [None]:
# Read the Fiberbits customer data straight from the hosted CSV
FIBER_URL = "https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Fiberbits/Fiberbits_v1.csv"
Fiber = pd.read_csv(FIBER_URL)

In [None]:
# Print pandas' summary of the Fiberbits DataFrame loaded above
Fiber.info()

In [None]:
# Predictors used to model whether a customer is active
fiber_features = [
    "income",
    "months_on_network",
    "Num_complaints",
    "number_plan_changes",
    "relocated",
    "monthly_bill",
    "technical_issues_per_month",
    "Speed_test_result",
]

logistic1 = LogisticRegression()
# Fit active_cust on the predictors above. The target is passed as a 1-D Series:
# a one-column DataFrame (Fiber[['active_cust']]) makes scikit-learn emit a
# DataConversionWarning about a column-vector y.
logistic1.fit(Fiber[fiber_features], Fiber["active_cust"])


In [None]:
# Show the multiple-logistic model's parameters: intercept first, then one coefficient per predictor
for label, value in (("Intercept", logistic1.intercept_), ("Coefficients", logistic1.coef_)):
    print(label, value)

# Confusion Matrix & Accuracy

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
# Same predictor columns, in the same order, that logistic1 was fitted on
predictor_cols = [
    "income",
    "months_on_network",
    "Num_complaints",
    "number_plan_changes",
    "relocated",
    "monthly_bill",
    "technical_issues_per_month",
    "Speed_test_result",
]

# Predicted class for every row of the training data
predict1 = logistic1.predict(Fiber[predictor_cols])

# Cross-tabulate actual (first argument) against predicted (second argument) classes
cm1 = confusion_matrix(Fiber[['active_cust']], predict1)
print(cm1)

In [None]:
# Column totals of the confusion matrix
col_totals = cm1.sum(axis=0)
print("col sums", col_totals)

# Grand total of all cells in the matrix
total1 = cm1.sum()
print("Total", total1)


In [None]:
# Accuracy = cells on the main diagonal of the 2x2 matrix divided by the grand total
correct1 = cm1[0, 0] + cm1[1, 1]
accuracy1 = correct1 / total1
accuracy1

# Multicollinearity

In [None]:
import statsmodels.formula.api as sm

def vif_cal(input_data, dependent_col):
    """Print the variance inflation factor (VIF) of every predictor column.

    Each column other than ``dependent_col`` is regressed on all the remaining
    predictor columns with OLS, and ``VIF = 1 / (1 - R^2)`` is printed, rounded
    to two decimals.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame holding the dependent column and the predictor columns.
    dependent_col : str
        Name of the dependent column; it is dropped before any VIF is computed.

    Returns
    -------
    None
        Results are printed, one line per predictor.
    """
    x_vars = input_data.drop([dependent_col], axis=1)
    xvar_names = x_vars.columns

    # With fewer than two predictors there is nothing to regress a column on,
    # so no variance is "explained" by other predictors and the VIF is 1.
    if len(xvar_names) < 2:
        for name in xvar_names:
            print(name, " VIF = ", 1.0)
        return

    # Imported locally under its own alias: elsewhere in the notebook the global
    # name `sm` is rebound to statsmodels.api, which has no lowercase `ols`.
    import statsmodels.formula.api as smf

    for name in xvar_names:
        # `y` and `x` look unused but are resolved by name from this scope
        # when the formula "y~x" is evaluated.
        y = x_vars[name]
        x = x_vars[xvar_names.drop(name)]
        rsq = smf.ols(formula="y~x", data=x_vars).fit().rsquared
        # R^2 == 1 means the column is an exact linear combination of the others:
        # report an infinite VIF instead of dividing by zero.
        vif = float("inf") if rsq >= 1 else round(1 / (1 - rsq), 2)
        print(name, " VIF = ", vif)

In [None]:
# Print the VIF of every Fiber column except the dependent column active_cust
vif_cal(input_data=Fiber, dependent_col="active_cust")

In [None]:
# Re-run the VIF check without number_plan_changes, the column the original
# note identifies as having the highest VIF
fiber_reduced = Fiber.drop(columns=["number_plan_changes"])
vif_cal(input_data=fiber_reduced, dependent_col="active_cust")

# Individual Impact of Variables

In [None]:
import statsmodels.api as sm
# Predictors for the statsmodels Logit: the earlier set without number_plan_changes
logit_features = [
    "income",
    "months_on_network",
    "Num_complaints",
    "relocated",
    "monthly_bill",
    "technical_issues_per_month",
    "Speed_test_result",
]
# NOTE(review): no constant column is passed, and sm.Logit does not appear to add
# an intercept on its own -- confirm that a no-intercept model is intended.
m1 = sm.Logit(Fiber['active_cust'], Fiber[logit_features])

# Fit once and reuse the result; calling m1.fit() twice ran the optimiser twice
# and printed its convergence message twice.
m1_result = m1.fit()
print(m1_result.summary())

In [None]:
# Same model as the previous cell, now also without monthly_bill
logit_features = [
    "income",
    "months_on_network",
    "Num_complaints",
    "relocated",
    "technical_issues_per_month",
    "Speed_test_result",
]
# NOTE(review): no constant column is passed, and sm.Logit does not appear to add
# an intercept on its own -- confirm that a no-intercept model is intended.
m1 = sm.Logit(Fiber['active_cust'], Fiber[logit_features])

# Fit once and reuse the result; calling m1.fit() twice ran the optimiser twice
# and printed its convergence message twice.
m1_result = m1.fit()
print(m1_result.summary())

# Rank Ordering of the impactful variables

In [None]:
# Refit the six-predictor Logit so its summary table can be used to compare the
# predictors against each other
logit_features = [
    "income",
    "months_on_network",
    "Num_complaints",
    "relocated",
    "technical_issues_per_month",
    "Speed_test_result",
]
# NOTE(review): no constant column is passed, and sm.Logit does not appear to add
# an intercept on its own -- confirm that a no-intercept model is intended.
m1 = sm.Logit(Fiber['active_cust'], Fiber[logit_features])

# Fit once and reuse the result; calling m1.fit() twice ran the optimiser twice
# and printed its convergence message twice.
m1_result = m1.fit()
print(m1_result.summary())