# Write the file-reading utility module (`testutility.py`)

In [1]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re


################
# File Reading #
################

def read_config_file(filepath):
    '''
    Load a YAML file and return its parsed content.

    filepath: path of the YAML file, opened in text read mode.

    Returns whatever ``yaml.safe_load`` produces for the file. When the
    content cannot be parsed, the ``yaml.YAMLError`` is logged through
    ``logging.error`` and ``None`` is returned instead of raising.
    '''
    with open(filepath, 'r') as stream:
        try:
            parsed = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Parse problems are only logged; the caller receives None.
            logging.error(exc)
            parsed = None
    return parsed


def replacer(string, char):
    '''
    Collapse every run of two or more consecutive ``char`` into a single one.

    string: text to clean.
    char: the character (or literal substring) whose repetitions are collapsed.
          It is always matched literally, never as a regular expression.

    Returns the cleaned string; text without repeated ``char`` is returned
    unchanged.
    '''
    # re.escape keeps regex metacharacters such as '.', '+' or '*' literal, and
    # the non-capturing group makes the quantifier apply to all of ``char``.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement inserts ``char`` verbatim; a plain replacement
    # string would have its backslashes interpreted by re.sub.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,table_config):
    '''
    Standardize the column names of ``df`` and validate them against the
    column list in ``table_config``.

    Standardization (applied in place to ``df.columns``): lower-case the
    name, replace every non-word character (whitespace, punctuation such as
    ``()#%&*``) with ``_``, strip leading/trailing ``_`` and collapse runs
    of ``_`` into one.

    df: DataFrame whose column labels are strings.
    table_config: mapping with a ``'columns'`` entry listing the expected
                  column names; they are compared case-insensitively.

    Returns 1 when the standardized names equal the expected names
    (ignoring order), otherwise 0. A summary is printed in both cases and
    the differences are printed/logged on failure.
    '''
    # Vectorized clean-up of the header; df.columns is updated in place so the
    # caller keeps working with the standardized names.
    df.columns = (
        df.columns.str.lower()
        .str.replace(r'\W', '_', regex=True)
        .str.strip('_')
        .str.replace(r'_{2,}', '_', regex=True)
    )

    # Compare sorted name lists instead of reindexing the frame: no data is
    # copied and duplicate labels cannot make the comparison raise.
    ingested_col = sorted(df.columns)
    expected_col = sorted(col.lower() for col in table_config['columns'])

    print("The numbers of columns expected: " + str(len(expected_col)))
    print("The numbers of columns ingested: " + str(len(ingested_col)))

    if ingested_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    # Sorted so the report is deterministic from run to run.
    mismatched_columns_file = sorted(set(ingested_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(ingested_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {ingested_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Overwriting testutility.py


# write the yaml file

In [2]:
%%writefile file.yaml
# Ingestion settings; the notebook loads this file with read_config_file.
# file_name + "." + file_type form the path of the source file that is read.
# NOTE(review): dataset_name, table_name, outbound_delimiter and
# skip_leading_rows are not read by the notebook cells shown here - confirm
# whether a later pipeline stage uses them.
file_type: csv
dataset_name: testfile
file_name: test_data
table_name: edsurv
# delimiter handed to pd.read_csv when the source file is ingested
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# expected column names; col_header_val lower-cases them before comparing
columns: 
    - First_name
    - Last_name
    - Gender
    - Company


Overwriting file.yaml


In [3]:
import testutility as util

config_data = util.read_config_file("file.yaml")
# read_config_file logs YAML parse errors and returns None instead of raising,
# so fail fast here rather than with an opaque TypeError in a later cell.
assert config_data is not None, "file.yaml did not yield a configuration (empty file or YAML parse error)"
config_data

{'file_type': 'csv',
 'dataset_name': 'testfile',
 'file_name': 'test_data',
 'table_name': 'edsurv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['First_name', 'Last_name', 'Gender', 'Company']}

# create test file

In [4]:
# Build a small sample table and save it as the CSV read by the ingestion cells.
# Note: the file carries an 'Email' column where file.yaml lists 'Company',
# so the header validation further down reports a mismatch.
import pandas as pd

testdata = {
    'First_name': ['Gus', 'Orel', 'Wilhelmina', 'Julius', 'Linet'],
    'Last_name': ['Fooks', 'Pigny', 'Lackeye', 'Coldman', 'Chapiro'],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Email': [
        'gfooks0@bizjournals.com',
        'opigny1@google.co.uk',
        'wlackeye2@163.com',
        'jcoldman3@huffingtonpost.com',
        'lchapiro4@shop-pro.jp',
    ],
}
# The dict keys, in insertion order, fix the column order of the written file.
df = pd.DataFrame(testdata, columns=list(testdata))
df.to_csv("test_data.csv", index=False)

# data ingestion

In [5]:
# Baseline: read the CSV directly with pandas, without the YAML config
import pandas as pd

df_sample = pd.read_csv("test_data.csv", sep=',')
df_sample.head()

Unnamed: 0,First_name,Last_name,Gender,Email
0,Gus,Fooks,Male,gfooks0@bizjournals.com
1,Orel,Pigny,Female,opigny1@google.co.uk
2,Wilhelmina,Lackeye,Female,wlackeye2@163.com
3,Julius,Coldman,Male,jcoldman3@huffingtonpost.com
4,Linet,Chapiro,Female,lchapiro4@shop-pro.jp


In [6]:
# Read the source file described by the YAML config
file_type = config_data['file_type']
source_file = f"./{config_data['file_name']}.{file_type}"
# Pass the delimiter by keyword: positional arguments after the path are
# deprecated since pandas 1.3 (FutureWarning) and rejected by pandas 2.x.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,First_name,Last_name,Gender,Email
0,Gus,Fooks,Male,gfooks0@bizjournals.com
1,Orel,Pigny,Female,opigny1@google.co.uk
2,Wilhelmina,Lackeye,Female,wlackeye2@163.com
3,Julius,Coldman,Male,jcoldman3@huffingtonpost.com
4,Linet,Chapiro,Female,lchapiro4@shop-pro.jp


In [8]:
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])
if util.col_header_val(df,config_data)==0:
    print("validation failed")
    # write code to reject the file
else:
    print("col validation passed")
    # write the code to perform further action
    # in the pipleine

columns of files are: Index(['first_name', 'last_name', 'gender', 'email'], dtype='object')
columns of YAML are: ['First_name', 'Last_name', 'Gender', 'Company']
The numbers of columns expected: 4
The numbers of columns ingested: 4
column name and column length validation failed
Following File columns are not in the YAML file ['email']
Following YAML columns are not in the file uploaded ['company']
validation failed
