# Assignment 2.1 Use Case - Tayko Software Cataloger

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt

import dmba
from dmba import regressionSummary
from dmba import adjusted_r2_score, AIC_score, BIC_score

%matplotlib inline

no display found. Using non-interactive Agg backend


This case study examines how Tayko, a software catalog firm that sells games and educational software, prepared to launch a revised collection of items in a new catalog mailing. Tayko began as a software manufacturer and later expanded its offerings to include third-party titles, making its customer list a valuable asset. Because mailing is costly and most recipients are unlikely to purchase, the objective is to minimize wasted expenses by identifying customers with a higher probability of responding. In this analysis, logistic regression, multiple linear regression, and regression trees are applied to recommend which customers should receive the catalog and to estimate their expected spending, with the goal of optimizing the mailing campaign and improving gross profits.


In [3]:
df=pd.read_csv('Tayko.csv')

# Basic information
print("Shape of dataset:", df.shape)
print("\nData types:\n", df.dtypes)
df.head()

Shape of dataset: (2000, 25)

Data types:
 sequence_number         int64
US                      int64
source_a                int64
source_c                int64
source_b                int64
source_d                int64
source_e                int64
source_m                int64
source_o                int64
source_h                int64
source_r                int64
source_s                int64
source_t                int64
source_u                int64
source_p                int64
source_x                int64
source_w                int64
Freq                    int64
last_update_days_ago    int64
1st_update_days_ago     int64
Web order               int64
Gender=male             int64
Address_is_res          int64
Purchase                int64
Spending                int64
dtype: object


Unnamed: 0,sequence_number,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,...,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Purchase,Spending
0,1,1,0,0,1,0,0,0,0,0,...,0,0,2,3662,3662,1,0,1,1,128
1,2,1,0,0,0,0,1,0,0,0,...,0,0,0,2900,2900,1,1,0,0,0
2,3,1,0,0,0,0,0,0,0,0,...,0,0,2,3883,3914,0,0,0,1,127
3,4,1,0,1,0,0,0,0,0,0,...,0,0,1,829,829,0,1,0,0,0
4,5,1,0,1,0,0,0,0,0,0,...,0,0,1,869,869,0,0,0,0,0


In [4]:
df.describe()

Unnamed: 0,sequence_number,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,...,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Purchase,Spending
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,0.8245,0.1265,0.056,0.06,0.0415,0.151,0.0165,0.0335,0.0525,...,0.018,0.1375,1.417,2155.101,2435.6015,0.426,0.5245,0.221,0.5,102.625
std,577.494589,0.380489,0.332495,0.229979,0.237546,0.199493,0.358138,0.12742,0.179983,0.223089,...,0.132984,0.344461,1.405738,1141.302846,1077.872233,0.494617,0.499524,0.415024,0.500125,186.78261
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,500.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1133.0,1671.25,0.0,0.0,0.0,0.0,0.0
50%,1000.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,2280.0,2721.0,0.0,1.0,0.0,0.5,2.0
75%,1500.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3139.25,3353.0,1.0,1.0,0.0,1.0,153.0
max,2000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,15.0,4188.0,4188.0,1.0,1.0,1.0,1.0,1500.0


In [5]:
display(df.columns)

Index(['sequence_number', 'US', 'source_a', 'source_c', 'source_b', 'source_d',
       'source_e', 'source_m', 'source_o', 'source_h', 'source_r', 'source_s',
       'source_t', 'source_u', 'source_p', 'source_x', 'source_w', 'Freq',
       'last_update_days_ago', '1st_update_days_ago', 'Web order',
       'Gender=male', 'Address_is_res', 'Purchase', 'Spending'],
      dtype='object')

In [14]:
## Gross Profit = Expect Revenue - Mailing Cost
# 1. Computer the average spending perv person in the test mailing (Including purchase and non purchased, non purchaser spent = 0)
ave_spending = df['Spending'].mean()

# 2. Get the total revunue which is the answer multiply 180,000

expected_revenue = ave_spending * 180_000
mailing_cost = 180_000 * 2


# 3. Substract the mailing cost: 180,000 x 2 =$360,000
gross_profit = expected_revenue - mailing_cost

print('Average spending per person:', ave_spending)
print('Expected revenue:', expected_revenue)
print('mailing cost:', mailing_cost)
print('Gross profit:', gross_profit)


Average spending per person: 102.625
Expected revenue: 18472500.0
mailing cost: 360000
Gross profit: 18112500.0


## 2. 

In [None]:
predictors = ['']


X = pd.
train_X, valid_X, train_y, valid_y