The purpose of this Notebook is to create the fake data for the `timeseries_data_cleaning` Notebook.


In [1]:
import numpy as np
import pandas as pd
import string
import random

from faker import Faker
from pathlib import Path


In [2]:

# Seed every random source this notebook draws from, so that
# Restart Kernel -> Run All regenerates the same fake data.
SEED = 1

fake = Faker()
fake.seed_instance(SEED)  # private generator of this notebook-level instance
Faker.seed(SEED)          # shared generator used by Faker() instances created later
random.seed(SEED)         # stdlib RNG (random.choices, random.shuffle, ...)
np.random.seed(SEED)      # numpy global state, which pandas .sample() falls back to

In [3]:

# Have a list of cities

# Dates should be from 2020-2021 some percentage should be just wrong
  # In the future
  # Far distant past
  # Nonsensical, i.e. 2020-14-12, or containing numbers



# Temps should be in Celsius (converted from Fahrenheit readings) -- but some percentage should be left unconverted


In [4]:

# 100 uniform integer draws from 0..99, used below to try out pandas sampling
res = [random.randrange(100) for _ in range(100)]

In [5]:
# Wrap the draws in a one-column frame and summarise their distribution
df = pd.DataFrame({'randint': np.array(res)})
df.describe()


Unnamed: 0,randint
count,100.0
mean,53.81
std,30.060779
min,0.0
25%,29.0
50%,56.0
75%,82.25
max,99.0


In [6]:

# Peek at the first rows of the random-integer frame
df.head()

Unnamed: 0,randint
0,17
1,72
2,97
3,8
4,32


In [7]:
# Pull a 10% sample without replacement. A fixed random_state keeps the
# summary below reproducible across kernel restarts.
df_s = df.sample(frac=0.1, random_state=1)

# Remove the sampled rows from the working frame.
# NOTE: this re-binds `df`, so re-running this cell shrinks it again;
# re-run the cell that builds `df` first to start over.
df = df.drop(df_s.index)
df_s.describe()

Unnamed: 0,randint
count,10.0
mean,38.9
std,29.553906
min,1.0
25%,17.25
50%,37.5
75%,55.25
max,86.0


In [8]:
# Sanity check of random.choices with cumulative weights: with the
# cut-offs (.5, 50, 75, 100), 'A' should be drawn far less often than
# 'B', 'C' or 'D'.
choices = ['A','B','C','D']
# Each call returns a one-element list, so every draw becomes one row.
choice_draws = [
    random.choices(choices, cum_weights=(.5,50,75,100))
    for _ in range(10000)
]
df_choices = pd.DataFrame(choice_draws, columns=['draw'])
df_choices.value_counts()

draw
B       4964
D       2505
C       2473
A         58
dtype: int64

In [9]:
# Hand check of the Fahrenheit -> Celsius formula, using 60 degrees F
(60-32) *5/9

15.555555555555555

In [10]:
# Default pool of city names used by GenerateFakes.city_chooser
cities = ['Berkeley',
          'Oakland',
          'San Leandro',
          'San Francisco',
          'Richmond',
          'Antioch',
          'Albany',
          'Novato',
          'Morgan Hill',
          'San Jose'
          ]

class GenerateFakes:
  """
  Builds the individual fake columns (dates, cities, temperatures).

  The three date weights are passed to ``random.choices`` as *cumulative*
  weights, in the order far past, recent past, future. With the defaults
  (1, 99, 100) that is 1 part far past, 98 parts recent past and 1 part
  future out of 100.

  ``celsius_weight`` is likewise the first of two cumulative weights
  ``(celsius_weight, 100)``: it is the share, out of 100, of temperatures
  that are left unconverted (i.e. still in Fahrenheit).
  """

  def __init__(self,
               count,
               far_past_weight=1,
               past_weight=99,
               future_weight=100,
               celsius_weight=0.5
               ):
    """
    count: number of values every column maker returns
    far_past_weight, past_weight, future_weight: cumulative date weights
    celsius_weight: cumulative weight (out of 100) of unconverted temps
    """
    self.count = count
    self.far_past_weight = far_past_weight
    self.past_weight = past_weight
    self.future_weight = future_weight
    self.celsius_weight = celsius_weight
    self.fake = Faker()


  def date_selector(self):
    """
    Pick one kind of date (far past, recent past or future) according to
    the cumulative weights and generate it.

    Returns a one-element list holding the generated date.
    """
    weights = (self.far_past_weight, self.past_weight, self.future_weight)
    # Callables instead of ready-made values: only the selected kind of
    # date is actually generated, rather than three dates per call.
    date_makers = [lambda: self.fake.past_date(start_date="-200y"),
                   lambda: self.fake.past_date(start_date="-10y"),
                   self.fake.future_date
                  ]
    make_date = random.choices(date_makers, cum_weights=weights)[0]
    return [make_date()]


  def date_maker(self):
    """
    Set up dates in the far past, recent past, and future

    Returns a list of ``count`` dates.
    """
    return [self.date_selector()[0] for _ in range(self.count)]

  def city_chooser(self, city_list=cities):
    """
    Returns a list of ``count`` cities, each drawn uniformly from city_list
    """
    return [random.choice(city_list) for _ in range(self.count)]


  def temp_adder(self,affected=0.04):
    """
    Returns ``count`` temperature offsets: 0 for most rows and, for
    roughly ``affected`` percent of the rows, a spike drawn from an
    exponential distribution with mean 100.

    affected: percentage (0-100) of rows that receive a spike
    """
    temp_changes = []
    for _ in range(self.count):
      # random.random() is in [0, 1), so affected=0 never fires and
      # affected=100 always fires.
      if random.random() * 100 < affected:
        # Draw a fresh spike for each affected row, so the spikes differ.
        temp_changes.append(random.expovariate(0.01))
      else:
        temp_changes.append(0)
    return temp_changes

  def f_to_c(self, x):
    """
    x is temp in degrees F; returns the temp in degrees C
    """
    return (x - 32) * 5/9

  def dummy_f_to_c(self,x):
    """
    Identity "conversion": leaves a Fahrenheit reading as it is
    """
    return x

  # Fahrenheit to Celsius for a fixed percent
  def celsius_maker(self, temps_to_convert):
    """
    Converts a fraction of temps from Fahrenheit to Celsius

    Each temp is left unconverted with cumulative weight celsius_weight
    out of 100, and converted otherwise. Returns a list of the same length.
    """
    weights = (self.celsius_weight, 100)
    converters = [self.dummy_f_to_c, self.f_to_c]
    converted_temps = []
    for temp in temps_to_convert:
      convert = random.choices(converters, cum_weights=weights)[0]
      converted_temps.append(convert(temp))
    return converted_temps


  # office building temps
  def temperature_maker(self):
    """
    Returns list of temps

    Readings are drawn in Fahrenheit from a normal distribution
    (mu=68, sigma=3.0, rounded to 2 decimals), a small share gets a spike
    added (see temp_adder), and almost all are converted to Celsius
    (see celsius_maker).
    """
    # Get distribution for temps
    f_temps = [round(random.normalvariate(mu=68, sigma=3.0),2) for _ in range(self.count)]
    # Add the spike offsets element-wise; most offsets are 0
    all_f_temps = list(np.array(f_temps) + np.array(self.temp_adder()))
    # Convert almost all to Celsius
    return self.celsius_maker(temps_to_convert=all_f_temps)



class GenerateDataFrame:
  """
  Assembles a dataframe from column-making callables and optionally
  scrambles a fraction of its rows.
  """

  def __init__(self,
               fraction_to_scramble=None,
               **kwargs):
    """
    Pass the functions that generate the dataframe columns
    {<column name A>:<function that makes column>, }

    fraction_to_scramble: fraction (0-1) of rows whose values get their
      characters shuffled; None or 0 leaves the data untouched.
    """
    # Every keyword becomes an attribute named after its column
    self.__dict__.update(kwargs)
    self.fraction_to_scramble = fraction_to_scramble

  def randomize_string(self,x):
    """
    Function to randomize a string

    x is converted with str() first; returns its characters in shuffled order.
    """
    chars = list(str(x))
    random.shuffle(chars)
    return ''.join(chars)

  def make_results(self):
    """
    call all kwarg columns

    Returns {column name: whatever the column function returned}.
    """
    # The instance dict holds the column functions plus the scramble
    # setting; only the latter is not a column.
    return {col_name: func()
            for col_name, func in self.__dict__.items()
            if col_name != 'fraction_to_scramble'}

  def generate_raw_df(self):
    """
    Call the column functions and build the unscrambled dataframe
    """
    return pd.DataFrame.from_dict(self.make_results())

  def scrambler(self, df):
    """
    Scramble a sample of the dataframe

    Returns the untouched rows followed by the scrambled rows (original
    index labels are kept). When fraction_to_scramble is None or 0 the
    dataframe is returned unchanged.
    """
    # Without this guard, frac=None makes DataFrame.sample fall back to
    # n=1 and one row would be scrambled by default.
    if not self.fraction_to_scramble:
      return df
    # Take a sample from the dataframe without replacement. The sampling
    # seed is derived from the stdlib RNG so that random.seed() makes the
    # whole scrambling step reproducible.
    sample_to_scramble = df.sample(frac=self.fraction_to_scramble,
                                   random_state=random.randrange(2**32))
    # drop the items pulled from the sample
    untouched = df.drop(sample_to_scramble.index)
    # apply the randomizer to every value of the sampled rows
    for column in sample_to_scramble.columns:
      sample_to_scramble[column] = sample_to_scramble[column].apply(self.randomize_string)
    # concat the scrambled data to the untouched rows
    return pd.concat([untouched, sample_to_scramble])

  def pipeline(self):
    """
    Generate the raw dataframe and return it after scrambling
    """
    return self.scrambler(self.generate_raw_df())







In [11]:
fakeCols = GenerateFakes(count=10000)

# Column name -> function that produces the values of that column
maker_objs = {"date": fakeCols.date_maker,
              "city": fakeCols.city_chooser,
              "temp_c": fakeCols.temperature_maker
              }
dfMaker = GenerateDataFrame(fraction_to_scramble=0.01, **maker_objs)


In [None]:
# Column makers are exposed as attributes named after their column
dfMaker.temp_c

In [12]:
# Build the raw columns, then scramble the configured fraction of rows
fake_data = dfMaker.pipeline()

In [13]:
# Preview the first rows of the generated data
fake_data.head()

Unnamed: 0,date,city,temp_c
0,2013-12-04,Morgan Hill,20.544444
1,2018-10-24,San Leandro,19.738889
2,2022-09-03,San Jose,21.077778
3,2014-07-31,Richmond,18.222222
4,2015-02-07,San Jose,21.138889


In [15]:
# Save the generated records as an Excel workbook in the current working directory
output_file_name = 'temps.xlsx'
current_dir_path = Path.cwd()
path = current_dir_path / output_file_name
fake_data.to_excel(
    path,
    sheet_name='Building Temps',
    float_format="%.2f",
)