# Import the data, remove the header, look at the first few lines

In [291]:
import numpy as np
import random
import csv

with open('customertxndata.csv') as infile:
    input_data = csv.reader(infile)
    raw_data = list(input_data)
    header = raw_data[0]
    data = raw_data[1:]

print("Total number of entries:", len(data))
print("Preview of data:")
print(header)

# preview data 
for line in data[:15]: 
    print(line)

    

Total number of entries: 22800
Preview of data:
['Visits', 'Transactions', 'OS', 'Gender', 'Revenue']
['7', '0', 'Android', 'Male', '0']
['20', '1', 'iOS', 'NA', '576.866774966349']
['22', '1', 'iOS', 'Female', '850']
['24', '2', 'iOS', 'Female', '1050']
['1', '0', 'Android', 'Male', '0']
['13', '1', 'Android', 'Male', '460']
['23', '2', 'iOS', 'Male', '1850']
['14', '1', 'Android', 'Male', '480']
['11', '1', 'Android', 'Male', '110']
['24', '2', 'iOS', 'Male', '1950']
['17', '1', 'Android', 'Male', '225']
['14', '1', 'Android', 'NA', '344.651613803221']
['2', '0', 'Android', 'Male', '0']
['8', '1', 'Android', 'NA', '344.651613803221']
['24', '2', 'iOS', 'Male', '1850']


# Remove entries with missing data/Which entries have missing data?
TO-DO flesh this out more

Looking through the file with 'head' and 'tail' suggests that missing data is filled in with 'NA'


In [283]:
# find data with missing information and read it into a list 

complete_data = []
for item in data:
    if 'NA' not in item:
        complete_data.append(item)
    elif 'NA' in item:
        pass

print("Number of complete entries':", len(complete_data))

percent_complete_entries = round(len(complete_data)/len(data) * 100, 2)
print("Percent entries with complete data:", percent_complete_entries)

Number of complete entries': 15600
Percent entries with complete data: 68.42


# Summative Statistics 

### Find the mean number of visits

In [285]:
# use a list comprehension and numpy to concisely extract visits data, concert to int, find sum
visits = [int(row[0]) for row in complete_data]
sum_of_visits = sum(visits)
print("Total visits:", sum_of_visits)

mean_visits = np.mean(visits)
print("Mean visits:", mean_visits)

Total visits: 196445
Mean visits: 12.5926282051


### Find the median revenue 

In [286]:
revenue = [float(row[4]) for row in complete_data]
median_revenue = np.median(revenue)
print("Median revenue:", median_revenue)

Median revenue: 360.0


### Find min and max transactions

In [192]:
transactions = [int(row[1]) for row in complete_data]

# convert to list from generator object to allow use of max() and min() 
transactions_list = list(transactions)

# capture min/max
max_transactions = max(transactions_list)
min_transactions = min(transactions_list)

print("Snapshot of transactions list:\n", transactions_list[:25])
print("Max transactions:", max_transactions)
print("Min transactions:", min_transactions)

Snapshot of transactions list:
 [0, 1, 2, 0, 1, 2, 1, 1, 2, 1, 0, 2, 2, 2, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1]
Max transactions: 2
Min transactions: 0


### Find the most commonly used operating system 
Android by just over 2:1

In [193]:
# extract with list comprehension, use dict of counts idiom
os = [row[2] for row in complete_data]

os_counts = {}

for item in os:
    if item not in os_counts:
        os_counts[item] = 1
    else:
        os_counts[item] += 1
print(os_counts)
        

{'Android': 10711, 'iOS': 4889}


# Impute missing gender data

In [243]:
# let's see if there is a gender-based disparity in spending or visits

male_entries = [entry for entry in complete_data if entry[3] == 'Male']
female_entries = [entry for entry in complete_data if entry[3] == 'Female']

print("Number of male visits:", len(male_entries))
print("Number of female visits:", len(female_entries))

percent_male_visits = len(male_entries)/len(complete_data) * 100 
percent_female_visits = len(female_entries)/len(complete_data) * 100

print("percent male visits:", percent_male_visits)
print("percent female visits:", percent_female_visits)

# double check that the female visits + male visits adds up to the total 
assert(len(male_entries + female_entries) == len(complete_data))
# double check that the two percents add up to 100
assert(percent_male_visits + percent_female_visits == 100)

Number of male visits: 12930
Number of female visits: 2670
percent male visits: 82.88461538461539
percent female visits: 17.115384615384617


 A clear trend emerges in the sex of the visitors, so we will create a function that continues this distribution in imputation of gender data

### Create missing gender imputation function 

In [287]:
# create a function that returns "Male" or "Female" at same distributions as they occur in the complete data set
def impute_gender_data():
    number = random.randint(1,100)
    if number <= percent_male_visits:
        return "Male"
    elif number > percent_male_visits:
        return "Female"

# validate the function by using it 1000 times
test_pool = []
for iteration in range(1000):
    test_pool.append(impute_gender_data())

# find percentile occurences in the test data
test_male = list([item for item in test_pool if item == "Male"])
test_female = list([item for item in test_pool if item == "Female"])

print("Percent male in test:", len(test_male)/len(test_pool) * 100)
print("Percent female in test:", len(test_female)/len(test_pool) * 100)

Percent male in test: 81.2
Percent female in test: 18.8


It seems that our function is working well, so we will use it for the imputation of gender data

### Execute imputation of gender data

In [289]:
# copy data so as not overwrite the orignal value
gender_imputed_data = data
    
# call impute_gender_data() function on all missing values in new data set
for row in gender_imputed_data:
    if row[3] =='NA':
        row[3] = impute_gender_data()


Let's check before and after to make sure (it would probably be more memory efficient to just update data but I am being highly cautious)

In [292]:
print("Before imputation:")
for line in data[:15]:
    print(line)

print("\nAfter imputation:")
for line in gender_imputed_data[:15]: 
    print(line)

Before imputation:
['7', '0', 'Android', 'Male', '0']
['20', '1', 'iOS', 'NA', '576.866774966349']
['22', '1', 'iOS', 'Female', '850']
['24', '2', 'iOS', 'Female', '1050']
['1', '0', 'Android', 'Male', '0']
['13', '1', 'Android', 'Male', '460']
['23', '2', 'iOS', 'Male', '1850']
['14', '1', 'Android', 'Male', '480']
['11', '1', 'Android', 'Male', '110']
['24', '2', 'iOS', 'Male', '1950']
['17', '1', 'Android', 'Male', '225']
['14', '1', 'Android', 'NA', '344.651613803221']
['2', '0', 'Android', 'Male', '0']
['8', '1', 'Android', 'NA', '344.651613803221']
['24', '2', 'iOS', 'Male', '1850']

After imputation:
['7', '0', 'Android', 'Male', '0']
['20', '1', 'iOS', 'Male', '576.866774966349']
['22', '1', 'iOS', 'Female', '850']
['24', '2', 'iOS', 'Female', '1050']
['1', '0', 'Android', 'Male', '0']
['13', '1', 'Android', 'Male', '460']
['23', '2', 'iOS', 'Male', '1850']
['14', '1', 'Android', 'Male', '480']
['11', '1', 'Android', 'Male', '110']
['24', '2', 'iOS', 'Male', '1950']
['17', '1',


# Impute missing transaction data

It's worth noting that there are no outliers here, so we can probably use the average ormedian safely:

In [259]:
# extract transaction data, store counts in list
transactions = [row[1] for row in complete_data]

transaction_counts = {}

for item in transactions:
    if item not in transaction_counts:
        transaction_counts[item] = 1
    else:
        transaction_counts[item] += 1

# get stats from data
average_transactions = np.mean(transactions_list)
median_transactions = np.median(transactions_list)
print("Entries wi")
print("Transaction counts:", transaction_counts)
print("Average transactions:", average_transactions)
print("Median transactions:", median_transactions)

Transaction counts: {'0': 2913, '1': 9908, '2': 2779}
Average transactions: 0.99141025641
Median transactions: 1.0


We will go with the median to avoid introducing floats

### Execute imputation of transaction data

In [295]:
# copy over gender_imputed_data to continue imputation
fully_imputed_data = gender_imputed_data

# replace missing values with the median
for row in fully_imputed_data:
    if row[1] =='NA':
        row[1] = 1

# See if we have any missing data
for row in fully_imputed_data:
    if 'NA' in row:
        print("DANGER WILL ROBINSON")
        
# see if the number is what it should be
print("Entries in the gender imputed data set:", len(fully_imputed_data))
assert(len(gender_imputed_data) == len(data))

Entries in the gender imputed data set: 22800


Let's take a look at some of the data to see if there are any 'NA's just in case

In [276]:
for row in fully_imputed_data[:50]:
    print(row)

['7', '0', 'Android', 'Male', '0']
['20', '1', 'iOS', 'Male', '576.866774966349']
['22', '1', 'iOS', 'Female', '850']
['24', '2', 'iOS', 'Female', '1050']
['1', '0', 'Android', 'Male', '0']
['13', '1', 'Android', 'Male', '460']
['23', '2', 'iOS', 'Male', '1850']
['14', '1', 'Android', 'Male', '480']
['11', '1', 'Android', 'Male', '110']
['24', '2', 'iOS', 'Male', '1950']
['17', '1', 'Android', 'Male', '225']
['14', '1', 'Android', 'Female', '344.651613803221']
['2', '0', 'Android', 'Male', '0']
['8', '1', 'Android', 'Male', '344.651613803221']
['24', '2', 'iOS', 'Male', '1850']
['23', '2', 'iOS', 'Female', '1300']
['18', '2', 'Android', 'Male', '990.306213040332']
['16', 1, 'Android', 'Male', '405.244133595093']
['25', '2', 'iOS', 'Male', '1950']
['17', '1', 'Android', 'Male', '550']
['18', '2', 'iOS', 'Female', '1500']
['14', '1', 'Android', 'Male', '330']
['7', '0', 'Android', 'Male', '0']
['3', 1, 'Android', 'Male', '121.774494043513']
['25', '2', 'iOS', 'Male', '1222.52137420346']


# Split the data into two separate sets 

In [281]:
training_data = []
validation_data = []

iterator = 0
for i in range(len(fully_imputed_data)):
    row_of_interest = fully_imputed_data[iterator]
    # test even or odd
    if iterator % 2 == 0:
        validation_data.append(row_of_interest)
    else:
        training_data.append(row_of_interest)
    iterator += 1
    
print(len(training_data))
print(len(validation_data))
    

11400
11400


DONE-(5 pts) Locate the data set and load the data into R.

DONE-(10 pts) Calculate the following summative statistics:

DONE-total number of cases, 

DONE-mean number of visits,

DONE-median revenue,

DONE-maximum and minimum number of transactions, 

DONE-most commonly used operating system. 

DONE-Exclude any cases where there is a missing value.

(15 pts) Create a scatterplot of number of visits (x-axis) versus revenue (y-axis). Comment on the correlation between the two variables.

(10 pts) Which columns have missing data? How did you recognize them? How would you impute missing values?

DONE-(15 pts) Impute missing transaction and gender values.

DONE-(20 pts) Split the data set into two equally sized data sets where one can be used for training a model and the other for validation. Take every odd numbered case and add them to the training data set and every even numbered case and add them to the validation data set, i.e., row 1, 3, 5, 7, etc. are training data while rows 2, 4, 6, etc. are validation data.

(10 pts) Calculate the mean revenue for each data set and compare them. Comment on the difference.

(15 pts) For many data mining and machine learning tasks, there are packages in R. Find at least one package that has functions for creating training and validation data subsets and show how to use them.


### Add scatterplot

### Mean revenue of training vs. validation data

### Explain ibrary
numpy?
