# UFCFVQ-15-M Programming for Data Science (Autumn 2021)
# Programming Task 1

## Student Id: <span style="color: yellowgreen">2104988</span>

<p style="color:red; font-weight:bold; font-size:xx-small">OVERALL COURSEWORK MARK: ___%</p>

### Requirement FR1 - Develop a function to find the arithmetic mean

In [1]:
def _constrain(arr):
  """
  A utility function used to ensure the values in the pertinent 
  array are numbers.
  Input: list of values under consideration
  Output: constrained array of such values
  """
  assert all(                               
    type(i) is int 
    or type(i) is float 
    or i is None 
    for i in arr
  ), "arr elements should be int or float"

  arr = [i for i in arr if i is not None]   # remove all null entries
  return arr

# FR1
def arithmetic_mean(arr):
  arr = _constrain(arr)
  return sum(arr)/len(arr)

# test on sample
sample = [6, 2, 22, 21, 24, 23, 8, 9, 5, 3, 11, 28, 14, 13, 12, 26, 29, 16, 15, 20, 25, 1, 10, 18, 27, 19, 7]
print(f'sample mean: {round(arithmetic_mean(sample), 2)}')

sample mean: 15.33


<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

### Requirement FR2 - Develop a function to find the standard deviation

In [2]:
# implements the mean standard deviation
def standard_deviation(arr):
  arr = _constrain(arr)
  mu = arithmetic_mean(arr)
  mse = sum((x - mu)**2 for x in arr) / len(arr)
  return mse**0.5 

# test on sample
print(f'sample standard deviation: {round(standard_deviation(sample),2)}')

sample standard deviation: 8.39


<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

### Requirement FR3 - Develop a function to find the min/max values and count 

In [3]:
def info(arr):
  arr = _constrain(arr)
  return min(arr), max(arr), len(arr)

# test
stats = dict(zip(('sample min', 'sample max', 'sample count'), info(sample)))
for key, val in stats.items():
  print(f'{key}: {val}')

sample min: 1
sample max: 29
sample count: 27


<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

### Requirement FR4 - Develop a function to find the 25th, 50th and 75th percentiles 

In [4]:
def percentiles(arr):
  arr = _constrain(arr)

  def _calc_percentile(pos, arr):
    # sort list
    arr = sorted(arr) # used sorted() instead of list.sort() to maintain original data

    # return the last element for the 100th percentile
    if pos == 100:
      return arr[-1]

    # get index of percentile in the array
    index = (pos / 100 * len(arr))

    # adjust for whole number or fractional index
    if index.is_integer(): # whole number adjustment
      index = int(index) # convert index to integer if its whole
      percentile = sum((arr[index-1], arr[index])) / 2
    else: # fractional adjustment
      # round() rounds to the nearest even number so adjust index
      # for values with .5 where the integer part is even. eg. 6.5
      # will be rounded to 7 instead of 6
      index = (index + 0.1) if (index % 0.5) == 0 and ((index-0.5) % 2 == 0) else index
      percentile = arr[round(index)-1]

    return percentile

  # calculate 25th, 50th and 75th percentiles
  quartile_1 = _calc_percentile(25, arr)
  median = _calc_percentile(50, arr)
  quartile_3 = _calc_percentile(75, sorted(arr))
  return quartile_1, median, quartile_3

# test
stats = dict(zip(('25th', '50th', '75th'), percentiles(sample)))
for key, val in stats.items():
  print(f'{key}: {val}')

25th: 8
50th: 15
75th: 22


<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

### Requirement FR5 - Develop a function to read a single column from a CSV file

In [5]:
def read_column(filename:str, col:int)->tuple:
  """
  Read single column from csv
  Input:
    @param filename: path to the csv file
    @param col: number of the column to be read
  Output:
    @return (label, data): label -> name of column, data -> values under column
  """
  assert type(col) is int, "col should be an integer"
  # read file content into memory
  with open(filename, 'r') as f:
    lines = f.readlines()

  # remove return character at the end of lines and 
  # build a line generator
  lines = (line.strip().split(',') for line in lines)

  try:
    # read column data from each line
    column = [line[col] if (col < len(line) or col >= 0) else None  for line in lines]
  except IndexError as err:
    print(err)
    return
  
  # label is the column name, found on the first line. the remainder of the data represents
  label, data = (column[0], column[1:-1])
  # parse string value into appropriate data type
  data = [
    int(i) if i.isnumeric()                 # case integer
    else float(i) if (len(i.split('.'))==2) # case float
    else None if (i=='')                    # case null
    else i for i in data                    # case string
  ]
  return label, data
  
# reading column 2 (Wind)
col, val = read_column('task1.csv', 2)
print(f'column: {col}\nvalues: {val}')

column: Wind
values: [0.01, 1.412699784, 16.266484, 5.900014713, 0.0822, 0.005086066, 0.117978175, 7.468573702, 48.47990705, 1.378292235, 0.977082975, 32.17005152, 3.688710149, 366, 0.043437355, 1.379085419, 0.22061, 0.6093, 13.89880328, 0.080257771, 2.438966811, 0.633888557, 0.531134615, 5.857, 28.16738103, 111.59, 6.300269103, 0.000825, 0.608, 0.004365699, 60.31116, 0.198, 0.361973742, 0, 8.390989769, 0.105269485, 17.47251773, 6.789652, 0.5412648, 0.017452186, 0.123287671, 1.145, 0.307147387, 0.097338, 0, 12.62920683, 3.8407, 10.549225, 2.046003874, 3.875881, 0, 0.497064956, 0.0079, 1.428156269, 0.574940541, 0.453, 4.80031359, 0, 1.72268087, 1.493633864, 1.152924, 12.845, 12.657, 0, 6.497, 0.22774, 0, 0, 0.005, 0.006159, 6.894765181, 2.40369047, 50.83600814, 0.305662189, 16.81339144, 0.133, 1.6794, 0.76415852, 0, 19.825697, 0, 1.1274, 0.00075, 57.11571962, 277.7291061, 0, 0]


<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

### Requirement FR6 - Develop a function to read CSV data from a file into memory

In [6]:
def read_csv(filename:str)->dict:
  """
  Read data from csv column by column
  Input:
    @param filename: path to the csv file to be read
  Output:
    @return data: a dictionary of the form column: [data]
  """

  # get the number of columns in the csv
  with open(filename, 'r') as f:
    line = f.readline().strip()
  ncols = len(line.split(',')) # number of columns

  # read file column by column into data frame
  data = {}
  for i in range(ncols):
    col_label, col_data = read_column(filename, i)
    data[col_label] = col_data

  return data

# reading whole csv
whole_csv = read_csv('task1.csv')
for col, val in whole_csv.items():
  print(f'column: {col}\n{val}')

column: Hydropower
[0.117, 41.64945459, 17.26791, 37.52781287, 1.76795, 0.885632595, 0.45726487, 0.284810423, 387.6868063, 5.125584584, 244.839096, 387.2508762, 23.14438369, 1202.43, 56.64701407, 7.512550207, 0, 1.6272, 0.016454789, 20.69612073, 13.48338, 0.015617162, 38.93203591, 13.27777778, 64.2234218, 16.9, 5.720179713, 0, 0.221, 13.81099543, 139.6678322, 16.4112, 10.77, 3.1074588, 0.694087982, 0.0245, 45.85205715, 80.997, 10.3295118, 0, 2.433342329, 0.45, 0.094557415, 1.791368, 24.17649672, 32.39511459, 1.6934, 0.072348, 26.32037732, 138.4962175, 0, 62.91198244, 35.20305729, 2.255678699, 1.288541272, 0.0165, 69.46684307, 1.852952842, 35.99700727, 30.76868807, 9.383758, 1.971, 12.346, 0, 17.56396055, 190.21311, 0, 0, 3.591, 4.712487886, 0.914864856, 2.920390756, 35.19825724, 6.369370161, 61.91687566, 34.80394017, 4.4661, 7.597013, 0, 59.51828075, 0, 9.900734781, 0, 5.464660531, 288.7063404, 6.868202, 72.08394993]
column: Solar
[0.603, 0.108128528, 12.081099, 1.578641473, 0.03926, 0

<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

### Requirement FR7 - Develop a function to generate a set of statistics for a given data file

In [7]:
def statistics(dataframe:dict)->dict:
  """
  Generate statistics for data in memory
  Input:
    @param dataframe: a dictionary object holding the data read into memory
  Outout:
    @return stats_frame: a dictionary object holding the statictics on the input data
  """

  # a helper to functionally generate statictics on data. Takes a column and retuns
  # the name and statistics of that column
  def stat_gen(column):
    name, values = column
    _min, _max, count = info(values)
    mean = arithmetic_mean(values)
    stdev = standard_deviation(values)
    quartile1, quartile2, quartile3 = percentiles(values)
  
    stats = [count, round(mean,6), round(stdev,6), _min, round(quartile1,6), 
            round(quartile2,6), round(quartile3,6), _max]
    return (name, stats) 

  # add statistic names into the stats_frame
  stats_frame = {"stats": ["Count","Mean","Stdev","Min","25th","50th","75th","Max"]}
  result = dict(map(stat_gen, dataframe.items())) # generate statistics for all columns
  
  stats_frame = {**stats_frame, **result}  # merge generated statistics into the stats_frame
  return stats_frame

# test
stats_frame = statistics(whole_csv)
for col, val in stats_frame.items():
  print(f'{col}: {val}')

stats: ['Count', 'Mean', 'Stdev', 'Min', '25th', '50th', '75th', 'Max']
Hydropower: [87, 48.45479, 144.393637, 0, 0.694088, 7.51255, 35.198257, 1202.43]
Solar: [87, 6.692888, 23.17154, 0, 0.08, 0.585724, 2.3386, 177.5]
Wind: [87, 14.541606, 50.437145, 0, 0.080258, 0.977083, 6.497, 366]
Other: [86, 7.094786, 15.986218, 0, 0.109149, 0.932573, 6.331, 90.72346175]


<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

### Requirement FR8 - Develop a function to print a custom table

In [8]:
def view(stats:dict, cols:[] or ()=None, sep:str=None, pad:int=4)->None:
  """
  Display the data statistics as a 2-D grid.
  Input: 
    @param stats: dictionary generated by statistics()
    @param cols:  list or tuple the names of columns to show. If none, returns all columns
    @param sep: a character/sring to be used for grid border.
    @param pad: an integer x in the range 1 <= x < 10. Default is 4
  Output: 
    @returns None
    Prints 2D table of statictics to console
  """
  # filter columns
  if cols is not None:
    assert type(cols) is list or type(cols) is tuple \
    and all(col in stats.keys() for col in cols), \
      "ensure all specified columns are in the dataset"
    stats = {key: value for key, value in stats.items() if key in cols or key == 'stats'}

  # determine separator
  if sep is None:
    # default uses three separators: - for horizontal, | for vertical and + from the cross point
    hor, ver, jxn = ('-', '|', '+')
  else:
    assert type(sep) is str, "seperator should be a string"
    hor, ver, jxn = (sep, sep, sep) # set all separators to the one specified

  # verify that specified padding is in range
  assert type(pad) is int and (pad >= 1) and (pad < 10), \
    "padding should be an integer in the range 1-9"
  
  numcols = len(stats.keys()) # no. of colums in the table

  # list of the width of each column
  width = [pad + (max(len(str(i)) for i in stats[k])) for k in stats.keys()]
  # horizontal border
  line = jxn + jxn.join([hor*(width[i]) for i in range(numcols)]) + jxn

  # helper to flatten columns data. ie. to convert column name-values pair to 
  # a single list representing the column 
  def flatten(key, value):
    value = value.copy() # get copy of values in column so original data is not mutated
    value.insert(0, key) # add column name to the front of the values
    return value # return flattened column

  columns = [flatten(key, stats[key]) for key in stats.keys()]
  rows = [[arr[i] for arr in columns] for i in range(8)]

  # remove 'stats' from the headers
  rows[0][0] = ''

  print(line.replace(jxn+hor*width[0], ' '*(width[0]+1), 1)) # top horizontal border

  # build row strings with column seperators
  for row in rows:
    print(
      (' ' if row[0]=='' else ver) + 
      ver.join(
        str(row[i]).center(width[i]) +
        (ver if i==len(row)-1 else '')
        for i in range(len(row))
      )
    )
    # add header bottom border
    if row[0] == '':
      print(line)

  print(line + '\n') # bottom horizontal border
  return

# test
view(stats_frame)

          +--------------+------------+-------------+---------------+
          |  Hydropower  |   Solar    |     Wind    |     Other     |
+---------+--------------+------------+-------------+---------------+
|  Count  |      87      |     87     |      87     |       86      |
|   Mean  |   48.45479   |  6.692888  |  14.541606  |    7.094786   |
|  Stdev  |  144.393637  |  23.17154  |  50.437145  |   15.986218   |
|   Min   |      0       |     0      |      0      |       0       |
|   25th  |   0.694088   |    0.08    |   0.080258  |    0.109149   |
|   50th  |   7.51255    |  0.585724  |   0.977083  |    0.932573   |
|   75th  |  35.198257   |   2.3386   |    6.497    |     6.331     |
+---------+--------------+------------+-------------+---------------+



In [9]:
# custom seperator
view(stats_frame, sep='=')

          =  Hydropower  =   Solar    =     Wind    =     Other     =
=  Count  =      87      =     87     =      87     =       86      =
=   Mean  =   48.45479   =  6.692888  =  14.541606  =    7.094786   =
=  Stdev  =  144.393637  =  23.17154  =  50.437145  =   15.986218   =
=   Min   =      0       =     0      =      0      =       0       =
=   25th  =   0.694088   =    0.08    =   0.080258  =    0.109149   =
=   50th  =   7.51255    =  0.585724  =   0.977083  =    0.932573   =
=   75th  =  35.198257   =   2.3386   =    6.497    =     6.331     =



In [10]:
# filtered columns
view(stats_frame, sep='=', cols=['Hydropower', 'Solar'])

          =  Hydropower  =   Solar    =
=  Count  =      87      =     87     =
=   Mean  =   48.45479   =  6.692888  =
=  Stdev  =  144.393637  =  23.17154  =
=   Min   =      0       =     0      =
=   25th  =   0.694088   =    0.08    =
=   50th  =   7.51255    =  0.585724  =
=   75th  =  35.198257   =   2.3386   =



<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

# Coding Standards
<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>

# Process Development Report for Task 1


Add text here

<p style="color:red; font-weight:bold; font-size:xx-small">MARK: __%</p>
<p style="color:red; font-weight:bold; font-size:xx-small">FEEDBACK: </p>