In [104]:
import pandas as pd
import numpy as np
import logging
import subprocess
import gc
import re
from schema import Schema, SchemaError
import os
import yaml

In [105]:
%%writefile testutility.py

def read_yml_file(file_path):
    with open(file_path, 'r') as stream:
        try:
            return yaml.load(stream)
        except yaml.YAMLError as exec:
            logging.error(exec)

def replacer(string, char):
    pattern = char + '{2,}'
    string = re.sub(pattern, char, string)
    return string

def col_validate(df, col_config):
    cols = df.columns
    cols = cols.str.replace(' ', '_')
    cols = list(map(lambda x: replacer(x, '_'), list(cols)))
    expected_cols = list(map(lambda x: x.lower(), col_config['columns']))
    cols = list(map(lambda x: x.lower(), list(cols)))
    
    if len(cols) == len(expected_cols) and list(cols) == list(expected_cols):
        print("\nColumn name and column length are successfully validated!")
        return 1

    else:
        print("Column name and column length have not passed the validation test")
        mismatched_columns = list(set(colms).difference(expected_col))
        print("The following columns are missing from the YAML file", mismatched_columns)
        missing_file = list(set(expected_col).difference(colms))
        print("The followning columns are missing from the file uploaded", missing_file)
        logging.info(f'df columns: {colms}')
        logging.info(f'expected columns: {expected_col}')
        return 0

Overwriting testutility.py


### Writing the YAML File

In [106]:
%%writefile dataset.yaml
file_type: json
dataset_name: resumes
file_name: Resume
table_name: resumes
inbound_delimiter: ","
outbound_delimiter: "\n"
columns:
 - content
 - annotation

Overwriting dataset.yaml


### Reading the .json file, converting it to a .csv file and adding it to a dataframe

In [107]:
#data = pd.read_json ('Resume.json',lines=True)
#data.to_csv ('dataframe.csv', index = None)

### File Ingestion and Schema Validation

#### Reading the Configuration File

In [108]:
import testutility as util 
config_data = util.read_yml_file("dataset.yaml")

NameError: name 'yaml' is not defined

In [109]:
src_file = "./"+config_data['file_name']+"."+config_data['file_type']
data = pd.read_json ('src_file',lines=True)
data.to_csv ('dataframe.csv', index = None)
data.head()

NameError: name 'config_data' is not defined

In [None]:
### rest of the code......

In [45]:
data.head()

Unnamed: 0,content,annotation
0,Govardhana K\nSenior Software Engineer\n\nBeng...,"[{'label': ['Companies worked at'], 'points': ..."
1,"Harini Komaravelli\nTest Analyst at Oracle, Hy...","[{'label': ['Companies worked at'], 'points': ..."
2,Hartej Kathuria\nData Analyst Intern - Oracle ...,"[{'label': ['Skills'], 'points': [{'start': 22..."
3,Ijas Nizamuddin\nAssociate Consultant - State ...,"[{'label': ['Skills'], 'points': [{'start': 46..."
4,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","[{'label': ['Skills'], 'points': [{'start': 18..."


In [5]:
data.shape

(200, 2)

In [6]:
data.isnull().sum()

content       0
annotation    0
dtype: int64

In [7]:
# Function to get the size of dataset

def humanbytes(B):
   'Return the given bytes as a human friendly KB, MB, GB, or TB string'
   B = float(B)
   KB = float(1024)
   MB = float(KB ** 2) # 1,048,576
   GB = float(KB ** 3) # 1,073,741,824
   TB = float(KB ** 4) # 1,099,511,627,776

   if B < KB:
      return '{0} {1}'.format(B,'Bytes' if 0 == B > 1 else 'Byte')
   elif KB <= B < MB:
      return '{0:.2f} KB'.format(B/KB)
   elif MB <= B < GB:
      return '{0:.2f} MB'.format(B/MB)
   elif GB <= B < TB:
      return '{0:.2f} GB'.format(B/GB)
   elif TB <= B:
      return '{0:.2f} TB'.format(B/TB)

In [8]:
size = data.memory_usage(deep=True).sum()
file_size = humanbytes(size)
print(file_size)

1.41 MB


In [9]:
data.iloc[0]['annotation']

[{'label': ['Companies worked at'],
  'points': [{'start': 1749, 'end': 1754, 'text': 'Oracle'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1696, 'end': 1701, 'text': 'Oracle'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1417, 'end': 1422, 'text': 'Oracle'}]},
 {'label': ['Skills'],
  'points': [{'start': 1356,
    'end': 1792,
    'text': 'Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle\nPL-SQL programming, Sales Force with APEX.\nTools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer,\nPL/SQL Developer, WinSCP, Putty\nWeb Technologies: JavaScript, XML, HTML, Webservice\n\nOperating Systems: Linux, Windows\nVersion control system SVN & Git-Hub\nDatabases: Oracle\nMiddleware: Web logic, OC4J\nProduct FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1209, 'end': 1214, 'text': 'Oracle'}]},
 {'label': ['Skills'],
  'points': [{'start': 1136,
    'end': 1247,


In [10]:
data.iloc[0]['content']

'Govardhana K\nSenior Software Engineer\n\nBengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/\nb2de315d95905b68\n\nTotal IT experience 5 Years 6 Months\nCloud Lending Solutions INC 4 Month • Salesforce Developer\nOracle 5 Years 2 Month • Core Java Developer\nLanguages Core Java, Go Lang\nOracle PL-SQL programming,\nSales Force Developer with APEX.\n\nDesignations & Promotions\n\nWilling to relocate: Anywhere\n\nWORK EXPERIENCE\n\nSenior Software Engineer\n\nCloud Lending Solutions -  Bangalore, Karnataka -\n\nJanuary 2018 to Present\n\nPresent\n\nSenior Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2016 to December 2017\n\nStaff Consultant\n\nOracle -  Bangalore, Karnataka -\n\nJanuary 2014 to October 2016\n\nAssociate Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2012 to December 2013\n\nEDUCATION\n\nB.E in Computer Science Engineering\n\nAdithya Institute of Technology -  Tamil Nadu\n\nSeptember 2008 to June 2012\n\nhttps://www.