# Benford Digits 1 & 2

### 4th Aug 2014

In [4]:
# Module Benford.py

import os

### The significand ranges from 1 thru 9 inclusive (0 is excluded)

FIRST_DIGIT_PERCENTAGES = [
                            30.1,    ## '1'
                            17.6,
                            12.5,
                             9.7,
                             7.9,
                             6.7,
                             5.8,
                             5.1,
                             4.5     ## '9'
                          ]

### 2nd and subsequent digits range from 0 thru 9 inclusive (0 is included)

SECOND_DIGIT_PERCENTAGES = [
                             12.0,  ## '0'
                             11.4,
                             10.9,
                             10.4,
                             10.0,
                              9.7,
                              9.3,
                              9.0,
                              8.8,
                              8.5   ## '9'
                            ]


class BenfordDigit(object):
    '''
    Class for investigating whether given number digits adhere to Benford's Law
    Class Attributes:
        __experiment_name:          <string> unique label for each Benford experiment
        __digit_position:           <int>    significand = 1, 2nd digit, counting from LHS = 2, etc
        __digit_count:              <dict>   tally of numbers encountered in given digit position
        __current_number:           <int>    most recent number reported in digit position, used to increment 
                                             corrsponding entry in digit_count tally
        __number_count:             <int>    sum of all numbers encountered in a given digit_position
        __digit_percent:            <list>   list of percentages corresponding to relative frequencies 
                                             for each number for a given digit_position 
                                             (1-9 for significand, or 0-9 otherwise)
    '''
    
    def __init__(self, experiment_name, digit_position):
        self.__experiment_name = experiment_name
        self.__digit_position = digit_position
        
        self.__digit_count = {
                              '0': 0,
                              '1': 0,
                              '2': 0,
                              '3': 0,
                              '4': 0,
                              '5': 0,
                              '6': 0,
                              '7': 0,
                              '8': 0,
                              '9': 0
                              }
        self.__current_number = '1'
        self.__number_count = 0
        
        self.__digit_percent = []
        
    def __str__(self):
        return ('Experiment: %s   Digit position: %i' % (self.__experiment_name, self.__digit_position))
    
    def __repr__(self):
        return str(self)
    
    @property
    def experiment_name(self):
        return (self.__experiment_name)
    
    @property
    def digit_position(self):
        return (self.__digit_position)
    
    @property
    def digit_count(self):
        # return [(k,v) for k,v in sorted([(k,v) for k,v in self.__digit_count.items()])]
        return sorted([(k,v) for k,v in self.__digit_count.items()])
        
    
    @property
    def current_number(self):
        return self.__current_number
    
    @current_number.setter
    def current_number(self, number):
        self.__current_number = number
        self.__digit_count[self.__current_number] += 1       
        
        
    @property
    def number_count (self):
        if self.__digit_position == 1:
            # return all numbers excl. 0's
            return sum(value for key, value in self.__digit_count.items() if key != '0')
        else: 
            return sum(value for key, value in self.__digit_count.items()) ## Needs to be simplified!
    @property
    def digit_percent (self):
        self.__digit_percent = []  
        if self.__digit_position == 1:
            # return ratio of all numbers excl. 0's
            for i in range(1,10):
                self.__digit_percent.append(round((100.0*self.__digit_count[str(i)]/self.number_count), 1))
            return self.__digit_percent
        else:
            # return ratio of all numbers incl. 0's
            for i in range(0,10):
                self.__digit_percent.append(round((100.0*self.__digit_count[str(i)]/self.number_count), 1))
            return self.__digit_percent
        
      
### Significand
benford1 = BenfordDigit('Local P: Drive Files - Significand', 1)

### 2nd Digit
benford2 = BenfordDigit('Local P: Drive Files - 2nd Digit', 2)

for root, dirs, files in os.walk('P:/'):
    for name in files:
        filename = os.path.join(root, name)
        filesize = os.path.getsize(filename)
        
        digit1Strng = (str(filesize))[0]
        benford1.current_number = digit1Strng
        
        if filesize > 9:
            benford2.current_number = str(filesize)[1]
        
print ('\n\nNo of files inspected:\t', benford1.number_count, '\n')
print ('\nPredicted 1st Digit Ratios:\t', FIRST_DIGIT_PERCENTAGES)        
print ('Observed 1st Digit Ratios:\t', benford1.digit_percent, '\n')
print ('\nPredicted 2nd Digit Ratios:\t', SECOND_DIGIT_PERCENTAGES) 
print ('Observed 2nd Digit Ratios:\t', benford2.digit_percent)




No of files inspected:	 48571 


Predicted 1st Digit Ratios:	 [30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.5]
Observed 1st Digit Ratios:	 [29.1, 17.4, 13.5, 10.7, 7.6, 6.7, 5.8, 5.0, 4.1] 


Predicted 2nd Digit Ratios:	 [12.0, 11.4, 10.9, 10.4, 10.0, 9.7, 9.3, 9.0, 8.8, 8.5]
Observed 2nd Digit Ratios:	 [12.2, 10.4, 10.8, 10.4, 10.4, 9.8, 10.0, 8.7, 8.8, 8.6]
