In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #matplotlib.pyplot is a collection of command style functions that make matplotlib work like MATLAB
import seaborn as sns #Python data visualization library based on matplotlib.
plt.style.use('ggplot')
import plotly.graph_objs as go # For analysis and styling graphs
from plotly.offline import init_notebook_mode, iplot, download_plotlyjs
import cufflinks as cf # a library for easy interactive Pandas charting with Plotly.Cufflinks binds Plotly directly to pandas dataframes.
init_notebook_mode(connected=True) #Run at the start of every ipython notebook to use plotly.offline. This injects the plotly.js source files into the notebook.
cf.go_offline()
import warnings
warnings.filterwarnings('ignore')
#import os

In [2]:
df = pd.read_csv('BlackFriday.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 12 columns):
User_ID                       537577 non-null int64
Product_ID                    537577 non-null object
Gender                        537577 non-null object
Age                           537577 non-null object
Occupation                    537577 non-null int64
City_Category                 537577 non-null object
Stay_In_Current_City_Years    537577 non-null object
Marital_Status                537577 non-null int64
Product_Category_1            537577 non-null int64
Product_Category_2            370591 non-null float64
Product_Category_3            164278 non-null float64
Purchase                      537577 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 39.0+ MB


In [4]:
#checking data for the percentage of null values.df.drop(labels='Product_Category_3', axis=1, inplace=True)
temp_df = df.isnull().sum().reset_index()
temp_df['Percentage of Null Values'] = temp_df[0]/len(df)*100
temp_df.columns = ['Column Name', 'Number of Null Values','Percentage of Null Values']
temp_df

Unnamed: 0,Column Name,Number of Null Values,Percentage of Null Values
0,User_ID,0,0.0
1,Product_ID,0,0.0
2,Gender,0,0.0
3,Age,0,0.0
4,Occupation,0,0.0
5,City_Category,0,0.0
6,Stay_In_Current_City_Years,0,0.0
7,Marital_Status,0,0.0
8,Product_Category_1,0,0.0
9,Product_Category_2,166986,31.062713


In [5]:
df.drop(labels='Product_Category_3', axis=1, inplace=True)
# dropping product_category_3 column

In [6]:
df.drop(labels='Product_ID',axis=1,inplace=True)

In [7]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Purchase
count,537577.0,537577.0,537577.0,537577.0,370591.0,537577.0
mean,1002992.0,8.08271,0.408797,5.295546,9.842144,9333.859853
std,1714.393,6.52412,0.491612,3.750701,5.087259,4981.022133
min,1000001.0,0.0,0.0,1.0,2.0,185.0
25%,1001495.0,2.0,0.0,1.0,5.0,5866.0
50%,1003031.0,7.0,0.0,5.0,9.0,8062.0
75%,1004417.0,14.0,1.0,8.0,15.0,12073.0
max,1006040.0,20.0,1.0,18.0,18.0,23961.0


In [8]:
df['Product_Category_2'].fillna(9,inplace=True)
df.head()

Unnamed: 0,User_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,1000001,F,0-17,10,A,2,0,3,9.0,8370
1,1000001,F,0-17,10,A,2,0,1,6.0,15200
2,1000001,F,0-17,10,A,2,0,12,9.0,1422
3,1000001,F,0-17,10,A,2,0,12,14.0,1057
4,1000002,M,55+,16,C,4+,0,8,9.0,7969


In [9]:
# max products were purchased by males.
gender = df['Gender'].value_counts().reset_index()
gender.columns = ['Gender', 'Count']
#status['index'] = status['index'].apply(lambda x: 'M' if x==0 else 'F')
gender.iplot(kind='pie', labels='Gender', values='Count',title='Ratio of Male and Female')

In [10]:
# which age group purchased most products?
age = df['Age'].value_counts().reset_index()
age.columns = ['Age Group', 'Count']
age.iplot(kind='bar', x='Age Group', y='Count', title='Number of people belonging to different age groups',
         xTitle='Age Group', yTitle='Quantity', color='green')

In [11]:
# married and non married population
status = df['Marital_Status'].value_counts().reset_index()
# Converting 0 and 1 into Married and Non Married in status DataFrame
status['index'] = status['index'].apply(lambda x: 'Non Married' if x==0 else 'Married')

status.iplot(kind='pie', labels='index', values='Marital_Status', hole=0.1, pull=0.1,
          title='Ratio of Married people and Non Married people')
status.T

Unnamed: 0,0,1
index,Non Married,Married
Marital_Status,317817,219760


In [12]:
# to predict products purchased max no of times
pro1 = df['Product_Category_1'].reset_index()
pro1.columns = ['index', 'product']

pro2 = df['Product_Category_2'].reset_index()
pro2.columns = ['index', 'product']
product = pd.concat([pro1, pro2] )

product = product['product'].value_counts().reset_index()[:8]
product.iplot(kind='bar', x='index', y='product', title='Top 8 product number which purchase maximum number of times',
             xTitle='Product number', yTitle='Frequency', color='violet')

In [14]:
#df=pd.get_dummies(df,columns=['Gender','Age','City_Category'])

In [15]:

df['Stay_In_Current_City_Years'].replace('4+','4',inplace=True)
df['Stay_In_Current_City_Years']=pd.to_numeric(df['Stay_In_Current_City_Years'])
df.head(7)

Unnamed: 0,User_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,1000001,F,0-17,10,A,2,0,3,9.0,8370
1,1000001,F,0-17,10,A,2,0,1,6.0,15200
2,1000001,F,0-17,10,A,2,0,12,9.0,1422
3,1000001,F,0-17,10,A,2,0,12,14.0,1057
4,1000002,M,55+,16,C,4,0,8,9.0,7969
5,1000003,M,26-35,15,A,3,0,1,2.0,15227
6,1000004,M,46-50,7,B,2,1,1,8.0,19215


In [18]:
import numpy
import matplotlib.pyplot as plot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [19]:
# Import the dataset
dataset = pd.read_csv('BlackFriday.csv')
x= dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values

# Split the dataset into the training set and test set
# We're splitting the data in 1/3, so out of 30 rows, 20 rows will go into the training set,
# and 10 rows will go into the testing set.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 1/3)

# Creating a LinearRegression object and fitting it
# on our trainging set.
linearRegressor = LinearRegression()
linearRegressor.fit(xTrain, yTrain)

# Predicting the test set results
yPrediction = linearRegressor.predict(xTest)

# Visualising the training set results
plot.scatter(xTrain, yTrain, color = 'red')
plot.plot(xTrain, linearRegressor.predict(yTrain), color = 'blue')
plot.title('Product_Category_2 vs Purchase (Training set)')
plot.xlabel('Product_Category_2')
plot.ylabel('Purchase')
plot.show()

# Visualising the test set results
plot.scatter(xTest, yTest, color = 'red')
plot.plot(xTest, linearRegressor.predict(xTest), color = 'blue')
plot.title('Product_Category_2 vs Purchase (Test set)')
plot.xlabel('Product_Category_2')
plot.ylabel('Purchase')
plot.show()

ValueError: could not convert string to float: 'P00277442'