# Import packages

In [115]:
import pandas as pd


# Reading the data set

In [116]:
# Load the e-commerce customers CSV straight from its GitHub raw URL.
DATA_URL = "https://raw.githubusercontent.com/harshdesai7/Ecommerce-Customers/master/Ecommerce%20Customers.csv"
ecommerce = pd.read_csv(DATA_URL)

In [None]:
# Render the freshly loaded frame as the cell output for a first look.
ecommerce

# Exploratory Data Analysis (EDA)

In [None]:
# Show the first rows transposed, so each column name reads as a row label
# and wide records stay readable.
ecommerce.head().transpose()

In [None]:
# List the column labels so the drop and rename steps can refer to them.
ecommerce.columns

In [None]:
# The name, address and avatar columns identify a customer; they are not
# measurements and are not used in the analysis, so only the numeric
# columns are kept.
#
# The previous positional slice (`iloc[:, 3:]`) removed three more columns
# every time the cell was re-run. Selecting by dtype is idempotent: running
# the cell again leaves the frame unchanged.
# NOTE(review): assumes the three identifier columns are the only
# non-numeric ones - confirm with `ecommerce.dtypes`.
ecommerce = ecommerce.select_dtypes(include='number')

In [None]:
# Display the frame again to confirm which columns remain after the drop.
ecommerce

In [None]:
# Column labels containing spaces or dots are awkward for formula-based
# tools such as statsmodels, so each one is replaced by an
# underscore-separated identifier.
column_renames = {
    'Avg. Session Length': 'Avg_Session_Length',
    'Time on App': 'Time_on_App',
    'Time on Website': 'Time_on_Website',
    'Length of Membership': 'Length_of_Membership',
    'Yearly Amount Spent': 'Yearly_Amount_Spent',
}
ecommerce = ecommerce.rename(columns=column_renames)

# Show the resulting labels as the cell output.
ecommerce.columns

In [None]:
# Preview the first ten rows with the renamed columns.
ecommerce.head(n=10)

# Analysis before building a model
Before building a linear regression model we need to identify the target column
and the independent variables (feature columns).

---
Here the task is to predict the Yearly_Amount_Spent by a customer from that customer's feature vector (Avg_Session_Length, Time_on_App, Time_on_Website, Length_of_Membership).






In [None]:
# Linear regression assumes that the variables in the feature vector are
# independent of one another, so we check for correlation between them.
#
# To do that we compute the coefficient of correlation (R) for each pair of
# attributes. There are 5 attributes and each can be paired with the 4
# others, giving 5*4 = 20 ordered combinations; R is printed for every one.

from scipy.stats import pearsonr

columns = list(ecommerce.columns)
for first in columns:
    for second in columns:
        # A column paired with itself always has R = 1, so it is skipped.
        if first == second:
            continue
        # pearsonr returns (R, p-value); only R is reported here.
        r_value = pearsonr(ecommerce[first], ecommerce[second])[0]
        print(f"Coefficient of Correlation R({first},{second})  is:    {r_value!s}\n")

# Diagrammatic representation of the data (data visualisation)


In [None]:
# Scatter plot for every pair of columns, with a kernel density estimate of
# each column on the diagonal.
import seaborn as sns
sns.pairplot(ecommerce,diag_kind='kde')

# From the plot, the relationship between Length_of_Membership and Yearly_Amount_Spent appears to be the strongest.

# The diagonal density plots suggest that all attributes are approximately symmetric and normally distributed.

In [None]:
# Heat map of the pairwise correlation matrix; annot=True asks seaborn to
# write the numeric value inside each cell.
sns.heatmap(ecommerce.corr(),cmap='Blues',annot=True)

# The heat map shows the same relationships between attributes as the printed coefficients.

In [None]:
# Split every ordered pair of distinct columns into "related" and
# "not related" according to the strength of their Pearson correlation.
#
# - |R| close to 0  -> no linear relationship (the pair is treated as
#   independent for the purposes of this analysis).
# - |R| above the threshold -> the two columns have a relationship.
#
# The magnitude of R is compared, not its signed value: a strong negative
# correlation (e.g. R = -0.8) is a relationship too, and a plain
# `R <= 0.2` test would wrongly file it under "not related".
CORRELATION_THRESHOLD = 0.2

notrelated = []
related = []
for x in ecommerce.columns:
    for y in ecommerce.columns:
        # A column is trivially correlated with itself; skip that pair.
        if x == y:
            continue
        # pearsonr returns (R, p-value); only R is needed.
        r = pearsonr(ecommerce[x], ecommerce[y])[0]
        if abs(r) <= CORRELATION_THRESHOLD:
            notrelated.append((x, y))
        else:
            related.append((x, y))

print("Not Related pairs:")
for pair in notrelated:
    print(pair)
print("\n\nRelated pairs:")
for pair in related:
    print(pair)

In [None]:
# The feature vector holds the variables that are strongly related to
# "Yearly_Amount_Spent". From the analysis above we conclude that it is
# (Avg_Session_Length, Time_on_App, Length_of_Membership).

# Feature vector = independent variable vector = 'X'
feature_columns = ['Avg_Session_Length', 'Time_on_App', 'Length_of_Membership']
X = ecommerce[feature_columns]
print(X)

# Dependent variable 'Y', selected with a list so it stays a one-column
# DataFrame rather than a Series.
target_column = 'Yearly_Amount_Spent'
Y = ecommerce[[target_column]]
print(Y)

# Linear Regression


1.   Split the data set into train and test
2.   Fit the linear regression model on training set
3.   Predict the test data
4.   Analysis of the model



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. Fixing random_state makes the split,
# and therefore every coefficient and metric computed below, identical on
# each "Restart & Run All"; without it the numbers change on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,Y_train)

# Fitting returns the best-fit line, i.e. the coefficients of that line.
# The line is Y = m1*Avg_Session_Length + m2*Time_on_App + m3*Length_of_Membership + intercept

# (m1, m2, m3) are the coefficients, plus one intercept.

In [None]:
# Report the fitted slope for each feature and the intercept.
coefficients = list(lr.coef_)
intercept = lr.intercept_
print(f"Coefficients:{coefficients!s}")
print(f"Intercept:{intercept!s}")

In [None]:
# Predict the target for the held-out test rows with the fitted model.
predict=lr.predict(X_test)

In [None]:
from matplotlib import pyplot as plt

# Actual test values on the x-axis against the model's predictions on the
# y-axis; points close to a straight diagonal indicate a good fit.
fig, ax = plt.subplots()
ax.scatter(Y_test, predict)
ax.set_xlabel('Y_test')
ax.set_ylabel('predict')

In [None]:
from sklearn import metrics

# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order keeps this call
# consistent with the r2_score call and safe if the metric is ever swapped
# for an asymmetric one.
mse = metrics.mean_squared_error(Y_test, predict)
print("Mean squared Error:" + str(mse))

In [None]:
# R^2 on the test split, scaled to a percentage.
r2_percent = metrics.r2_score(Y_test, predict) * 100
print("Performance:", r2_percent)

# Analysis using stats model

In [None]:
import statsmodels.formula.api as smf

In [None]:
# statsmodels' formula API needs the features and the target in one frame,
# so the training split is glued back together column-wise.
# It is bound to a new name instead of overwriting `ecommerce`: the full
# dataset stays available, and re-running earlier cells no longer operates
# on the training subset by accident.
train_data = pd.concat([X_train, Y_train], axis=1)

# Same model as the scikit-learn fit: target ~ the three selected features.
lm = smf.ols(
    formula='Yearly_Amount_Spent ~ Avg_Session_Length + Time_on_App + Length_of_Membership',
    data=train_data,
)

In [None]:
# Fit the OLS model specified above; the result object holds the estimates.
lmodel=lm.fit()

In [None]:
# Estimated intercept and coefficients, for comparison with the
# scikit-learn values printed earlier.
lmodel.params

In [None]:
# Full regression table; printed so the plain-text layout is preserved.
print(lmodel.summary())

"std err" is the standard error of an estimated coefficient, i.e. the standard deviation of the distribution of all possible values of that coefficient (Ex: look at Avg_Session_Length; if the model were fitted on many different samples we would get many possible values for the Avg_Session_Length coefficient, and "std err" estimates the standard deviation of the distribution of those possible values).

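"t" plays the same role as a z-score: it is the estimated coefficient divided by its "std err", i.e. how many standard errors the fitted coefficient lies from zero, the central value of the distribution of possible coefficient values when the column has no effect on the target.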

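"p_score" (the "P>|t|" column, i.e. the p-value) is used to judge whether an estimated relationship could be a statistical fluke (Ex: it is the probability of observing a relationship at least this strong between the variable and the target in a sample drawn from a population where there is no relation between the variable and the target). A small value means a fluke is unlikely.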

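We got a "p_value" of 0.773 for Time_on_Website (obtained when that column is included in the formula; it is not part of the model fitted above), so we may conclude that its apparent relationship with the target might be a fluke, and that including this attribute in another model may give a less powerful model.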
