In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

In [2]:
# Read in CSV File
document_check = 'document_check_sample.csv'
facial_report = 'facial_report_sample.csv'

document_df = pd.read_csv(document_check)
facial_df = pd.read_csv(facial_report)


## Understand the attributes of data frames: 
- What are the attributes 
- What are their unique values 

In [3]:
# Understand the attributes
document_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11760 entries, 0 to 11759
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Unnamed: 0                          5880 non-null   float64
 1   user_id                             5880 non-null   object 
 2   result                              5880 non-null   object 
 3   visual_authenticity_result          4991 non-null   object 
 4   image_integrity_result              5880 non-null   object 
 5   face_detection_result               4990 non-null   object 
 6   image_quality_result                5880 non-null   object 
 7   created_at                          5880 non-null   object 
 8   supported_document_result           5869 non-null   object 
 9   conclusive_document_quality_result  3140 non-null   object 
 10  colour_picture_result               3140 non-null   object 
 11  data_validation_result              4739 

In [4]:
# Understand the attributes
facial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11760 entries, 0 to 11759
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     5880 non-null   float64
 1   user_id                        5880 non-null   object 
 2   result                         5880 non-null   object 
 3   face_comparison_result         5535 non-null   object 
 4   created_at                     5880 non-null   object 
 5   facial_image_integrity_result  5861 non-null   object 
 6   visual_authenticity_result     4991 non-null   object 
 7   properties                     5880 non-null   object 
 8   attempt_id                     5880 non-null   object 
dtypes: float64(1), object(8)
memory usage: 827.0+ KB


## Dropping Nan vales 
- Based on the information above, there are 11760 entries, however there are only 5880 rows with result. Which means we canont make any assumption out of the rows with no result
- Hence, I will decide to drop all those rows

In [5]:
# Dropping the rows with Na values for result
document_df = document_df.dropna(subset=['result']).reset_index(drop=True)
facial_df = facial_df.dropna(subset=['result']).reset_index(drop=True)



## Handling document file

In [6]:
# Remmove unrelevant attributes

document_importantAttr = [
 'result', 
 'visual_authenticity_result',
 'image_integrity_result',
 'face_detection_result',
 'image_quality_result',
 'created_at',
 'supported_document_result',
 'conclusive_document_quality_result',
 'colour_picture_result',
 'data_validation_result',
 'data_consistency_result',
 'data_comparison_result',
 'police_record_result',
 'compromised_document_result',]

In [7]:
# Currently, the date time is object so it will not be sorted correctly. We will convert that to date-time format

document_df['created_at'] = pd.to_datetime(document_df['created_at'], format='%d/%m/%Y %H:%M')
document_df['created_at'].info()


<class 'pandas.core.series.Series'>
RangeIndex: 5880 entries, 0 to 5879
Series name: created_at
Non-Null Count  Dtype         
--------------  -----         
5880 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 46.1 KB


In [8]:
# Since the result recently decreased, sort the dataset by date 'created_at' 

document_filtered = document_df[document_importantAttr]
document_filtered = document_filtered.sort_values(by='created_at').reset_index(drop=True)

In [9]:
# Check the unique values in all attributes 

for attribute in document_importantAttr:
    unique_values = document_filtered[attribute].unique()
    print(f"Unique values for {attribute}:\n{unique_values}\n")


Unique values for result:
['clear' 'consider']

Unique values for visual_authenticity_result:
['clear' 'consider' nan]

Unique values for image_integrity_result:
['clear' 'consider']

Unique values for face_detection_result:
['clear' nan 'consider']

Unique values for image_quality_result:
['clear' 'unidentified']

Unique values for created_at:
['2017-05-23T17:25:00.000000000' '2017-05-23T19:17:00.000000000'
 '2017-05-23T20:49:00.000000000' ... '2017-10-31T21:11:00.000000000'
 '2017-10-31T21:46:00.000000000' '2017-10-31T22:06:00.000000000']

Unique values for supported_document_result:
['clear' nan 'unidentified']

Unique values for conclusive_document_quality_result:
[nan 'clear' 'consider']

Unique values for colour_picture_result:
[nan 'clear' 'consider']

Unique values for data_validation_result:
['clear' nan 'consider']

Unique values for data_consistency_result:
[nan 'clear' 'consider']

Unique values for data_comparison_result:
['clear' 'consider' nan]

Unique values for police_

In [10]:
document_filtered

Unnamed: 0,result,visual_authenticity_result,image_integrity_result,face_detection_result,image_quality_result,created_at,supported_document_result,conclusive_document_quality_result,colour_picture_result,data_validation_result,data_consistency_result,data_comparison_result,police_record_result,compromised_document_result
0,clear,clear,clear,clear,clear,2017-05-23 17:25:00,clear,,,clear,,clear,clear,
1,clear,clear,clear,clear,clear,2017-05-23 19:17:00,clear,,,clear,clear,clear,clear,
2,clear,clear,clear,clear,clear,2017-05-23 20:49:00,clear,,,clear,clear,clear,clear,
3,consider,clear,clear,clear,clear,2017-05-24 07:47:00,clear,,,clear,clear,consider,clear,
4,clear,clear,clear,clear,clear,2017-05-24 11:38:00,clear,,,clear,,clear,clear,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5875,consider,,consider,,unidentified,2017-10-31 21:06:00,clear,,,,,,,
5876,consider,,consider,,unidentified,2017-10-31 21:10:00,clear,,,,,,,
5877,clear,clear,clear,clear,clear,2017-10-31 21:11:00,clear,clear,clear,clear,clear,,clear,clear
5878,clear,clear,clear,clear,clear,2017-10-31 21:46:00,clear,clear,clear,,,,clear,clear


In [11]:
# Set 'created_at' as the index
document_filtered.set_index('created_at', inplace=True)

# Group by month and apply value_counts for each column
monthly_stats = document_filtered.resample('M').apply(lambda x: x.apply(lambda y: y.value_counts())).unstack(fill_value=0)

# Reset the index to make 'created_at' a column again
monthly_stats.reset_index(inplace=True)



In [12]:
monthly_stats

Unnamed: 0_level_0,created_at,result,result,result,visual_authenticity_result,visual_authenticity_result,visual_authenticity_result,image_integrity_result,image_integrity_result,image_integrity_result,...,data_consistency_result,data_comparison_result,data_comparison_result,data_comparison_result,police_record_result,police_record_result,police_record_result,compromised_document_result,compromised_document_result,compromised_document_result
Unnamed: 0_level_1,Unnamed: 1_level_1,clear,consider,unidentified,clear,consider,unidentified,clear,consider,unidentified,...,unidentified,clear,consider,unidentified,clear,consider,unidentified,clear,consider,unidentified
0,2017-05-31,56.0,2.0,0.0,58.0,,0.0,58.0,,0.0,...,0.0,56.0,2.0,0.0,49.0,,0.0,,,0.0
1,2017-06-30,267.0,9.0,,272.0,3.0,,275.0,1.0,,...,,24.0,3.0,,238.0,,,,,
2,2017-07-31,1013.0,120.0,,1019.0,18.0,,1037.0,96.0,,...,,,,,1008.0,,,,,
3,2017-08-31,1017.0,211.0,,1034.0,29.0,,1051.0,177.0,,...,,,,,1026.0,,,,,
4,2017-09-30,901.0,352.0,,1012.0,21.0,,930.0,323.0,,...,,,,,986.0,,,133.0,,
5,2017-10-31,1152.0,780.0,,1499.0,26.0,,1192.0,740.0,,...,,,,,1484.0,,,1391.0,,
