# Set up

In [1]:
import pandas as pd
import random
import numpy as np

# Load in relevant raw data

In [2]:
# Read the raw game sales CSV straight from the GitHub repository
df = pd.read_csv(
    "https://raw.githubusercontent.com/zaclovell/randomised_dataset_generators/master/game_sales_data.csv"
)

##### Note: Check that you have loaded the correct raw data

In [3]:
# Show the loaded frame so its columns and values can be checked by eye
df

Unnamed: 0,postcode,bank_no,digi_or_phys,played_24_after_purchase,purchase_date
0,AB9,1234.0,digital,1.0,01/02/2022
1,CD3,3465.0,physical,0.0,02/02/2022
2,EF5,3095.0,,,03/02/2022
3,HI4,9545.0,,,04/02/2022
4,FG8,2332.0,,,05/02/2022
5,WQ6,6454.0,,,06/02/2022
6,FG8,3345.0,,,07/02/2022
7,SD7,1287.0,,,08/02/2022
8,SD3,4598.0,,,09/02/2022
9,QWF6,,,,10/02/2022


##### Example: Call a column

In [4]:
# Selecting a single column by name returns it as a pandas Series
df["postcode"]

0      AB9
1      CD3
2      EF5
3      HI4
4      FG8
5      WQ6
6      FG8
7      SD7
8      SD3
9     QWF6
10     EF6
11     HI5
12     FG9
13     WQ7
14     FG9
15     FG8
16     SD4
17    QWF7
18     EF7
19     HI6
20    FG10
21     WQ8
22     NaN
Name: postcode, dtype: object

## Turn selected columns into lists

In [124]:
# Pull each column of interest out of the frame as a plain Python list
purchase_location, game_version, purchase_date = (
    df[column].to_list() for column in ("postcode", "digi_or_phys", "purchase_date")
)

##### Note: You can check the data type to make sure each variable is now a list

In [125]:
# Only the last line is live; swap which line is commented out to inspect a different list
#type(purchase_location)
#type(game_version)
type(purchase_date)

list

## Due to the nature of the raw data, we need to remove NaNs from each list

In [131]:
# for purchase_location list: keep only the entries that are not missing (NaN/None)
purchase_location = [x for x in purchase_location if pd.notna(x)]
# purchase_location

In [132]:
# for game_version list: keep only the entries that are not missing (NaN/None)
game_version = [x for x in game_version if pd.notna(x)]
# game_version

## Since we have prepped the raw data, we can now use the function to produce the randomised game sales dataset

In [133]:
def random_game_sales_dataset(num, seed=None, locations=None, versions=None, dates=None):
    '''
    Build a randomised game sales dataset by sampling each column independently.

    Every column is drawn (with replacement) from its own pool of values, so the
    generated rows are synthetic combinations rather than rows of the raw data.

    Parameters
    ----------
    num : int
        Number of rows to generate.
    seed : int, optional
        Seed for a dedicated random generator, which makes the output
        reproducible. When omitted, NumPy's global random state is used.
    locations, versions, dates : list, optional
        Pools of values to sample from. When omitted, each falls back to the
        notebook-level list ``purchase_location``, ``game_version`` and
        ``purchase_date`` respectively.

    Returns
    -------
    pandas.DataFrame
        A frame with ``num`` rows and the columns ``purchase_location``,
        ``game_version`` and ``purchase_date``.
    '''

    # Fall back to the lists prepared earlier in the notebook when no pool is given
    pools = {
        "purchase_location": purchase_location if locations is None else locations,
        "game_version": game_version if versions is None else versions,
        "purchase_date": purchase_date if dates is None else dates,
    }

    # A seeded generator gives repeatable output; otherwise keep using the global state
    rng = np.random if seed is None else np.random.default_rng(seed)

    columns = {}
    for column_name, pool in pools.items():
        # Drop missing values before sampling: in a list of strings NumPy would
        # otherwise coerce NaN to the text "nan" and it could end up in the output
        valid_values = [value for value in pool if pd.notna(value)]
        # Each draw is a NumPy array of length `num`; it becomes one DataFrame column
        columns[column_name] = rng.choice(valid_values, size=num)

    return pd.DataFrame(columns)

In [134]:
# Generate a 50-row sample dataset and display it
random_game_sales_dataset(50)

Unnamed: 0,purchase_location,game_version,purchase_date
0,EF7,physical,10/02/2022
1,WQ8,physical,01/02/2022
2,EF5,digital,10/02/2022
3,HI6,digital,23/02/2022
4,HI4,physical,10/02/2022
5,HI5,digital,13/02/2022
6,WQ8,digital,13/02/2022
7,SD4,digital,14/02/2022
8,WQ8,physical,20/02/2022
9,FG8,physical,23/02/2022


# To Do

- add in more columns, make the dataset a lot larger
- find a way to bring in real data, e.g. all UK postcodes, payments from banks, etc.

# Saving the dataset as CSV

In [135]:
# Generate a fresh 50-row dataset and write it to disk without the index column
example_dataset = random_game_sales_dataset(50)
example_dataset.to_csv("example_random_game_sales_dataset.csv", index=False)

In [136]:
# Read the CSV back in and preview the first rows to confirm it was written as expected
df_saved_file = pd.read_csv("example_random_game_sales_dataset.csv")
df_saved_file.head()

Unnamed: 0,purchase_location,game_version,purchase_date
0,AB9,digital,12/02/2022
1,EF6,digital,17/02/2022
2,QWF6,physical,03/02/2022
3,EF6,digital,17/02/2022
4,FG8,physical,17/02/2022
