In [None]:
import os
os.environ['KAGGLE_USERNAME'] = "__"
os.environ['KAGGLE_KEY'] = "__"
!kaggle competitions download -c riiid-test-answer-prediction

Downloading competition.cpython-37m-x86_64-linux-gnu.so to /content
  0% 0.00/445k [00:00<?, ?B/s]
100% 445k/445k [00:00<00:00, 64.7MB/s]
Downloading __init__.py to /content
  0% 0.00/59.0 [00:00<?, ?B/s]
100% 59.0/59.0 [00:00<00:00, 169kB/s]
Downloading example_sample_submission.csv to /content
  0% 0.00/971 [00:00<?, ?B/s]
100% 971/971 [00:00<00:00, 935kB/s]
Downloading train.csv.zip to /content
 99% 1.28G/1.29G [00:17<00:00, 56.8MB/s]
100% 1.29G/1.29G [00:17<00:00, 77.2MB/s]
Downloading example_test.csv to /content
  0% 0.00/5.99k [00:00<?, ?B/s]
100% 5.99k/5.99k [00:00<00:00, 4.94MB/s]
Downloading lectures.csv to /content
  0% 0.00/9.48k [00:00<?, ?B/s]
100% 9.48k/9.48k [00:00<00:00, 8.66MB/s]
Downloading questions.csv to /content
  0% 0.00/289k [00:00<?, ?B/s]
100% 289k/289k [00:00<00:00, 88.8MB/s]


In [None]:
from zipfile import ZipFile
file_name = "/content/train.csv.zip"

# Bind the archive to a name that does not shadow the built-in `zip`;
# otherwise `zip(...)` would be unusable in every later cell of the notebook.
with ZipFile(file_name, 'r') as archive:
    # No target path is given, so members are extracted into the current
    # working directory.
    archive.extractall()
    print('Done')

Done


# Data Ingestion

In [18]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re


################
# File Reading #
################

def read_config_file(filepath):
    """Parse the YAML file at ``filepath`` and return its content.

    A YAML syntax error is logged at ERROR level and ``None`` is returned
    instead of raising. Errors from opening the file are not handled here
    and propagate to the caller.
    """
    try:
        with open(filepath, 'r') as handle:
            return yaml.safe_load(handle)
    except yaml.YAMLError as parse_error:
        logging.error(parse_error)
    return None


def replacer(string, char):
    """Collapse every run of two or more consecutive ``char`` into one.

    ``char`` is matched as literal text, so regex metacharacters such as
    ``.``, ``|`` or ``*`` are safe to pass.

    Args:
        string: Text to clean up.
        char: The character (or literal substring) whose repeats are collapsed.

    Returns:
        ``string`` with each run of repeated ``char`` replaced by one ``char``.
    """
    if not char:
        # An empty needle would match between every character; nothing to do.
        return string
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps `char` literal (a plain string would have
    # backslashes interpreted as escape sequences by re.sub).
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df, table_config):
    """Check that the DataFrame's column names match the configured ones.

    Both sides are normalised before comparison: DataFrame headers are
    stripped, lower-cased and have spaces replaced by underscores; configured
    names are lower-cased. Order is ignored. ``df`` itself is not modified.

    Args:
        df: DataFrame whose (string) column labels are validated.
        table_config: Mapping with a ``'columns'`` list of expected names.

    Returns:
        1 if the normalised names match exactly, otherwise 0 (after printing
        and logging the differences).
    """
    # Index.str methods return new objects, so the steps are chained and the
    # final result kept. The expected names come from the `table_config`
    # argument, not from a global of the calling notebook.
    cols = sorted(
        df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
    )
    expected_col = sorted(name.lower() for name in table_config['columns'])

    # List equality already implies equal length.
    if cols == expected_col:
        print("Column name and Column length Validation Passed!!")
        return 1

    print("Column name and Column length Validation Failed..")
    mismatched_columns_file = list(set(cols).difference(expected_col))
    print("Following File columns are not in the YAML file", mismatched_columns_file)
    missing_YAML_file = list(set(expected_col).difference(cols))
    print("Following YAML columns are not in the file uploaded", missing_YAML_file)
    logging.info(f'df columns: {cols}')
    logging.info(f'expected columns: {expected_col}')
    return 0

def humanbytes(B):
    """Return the given byte count as a human friendly string.

    Values below 1024 are reported in bytes (singular "Byte" only for
    exactly 1); larger values are scaled by powers of 1024 and shown with two
    decimals as KB, MB, GB or TB.

    Args:
        B: Number of bytes; anything accepted by ``float()``.

    Returns:
        The formatted size, e.g. ``'512.0 Bytes'`` or ``'1.50 KB'``.
    """
    B = float(B)
    KB = float(1024)
    MB = float(KB ** 2)  # 1,048,576
    GB = float(KB ** 3)  # 1,073,741,824
    TB = float(KB ** 4)  # 1,099,511,627,776

    if B < KB:
        # The previous test `0 == B > 1` could never be true, so the plural
        # form was never used.
        return '{0} {1}'.format(B, 'Byte' if B == 1 else 'Bytes')
    elif KB <= B < MB:
        return '{0:.2f} KB'.format(B / KB)
    elif MB <= B < GB:
        return '{0:.2f} MB'.format(B / MB)
    elif GB <= B < TB:
        return '{0:.2f} GB'.format(B / GB)
    elif TB <= B:
        return '{0:.2f} TB'.format(B / TB)

def stats(df, config_data):
    """Print basic statistics for ``df`` once its header validation passes.

    Nothing is printed beyond the validation messages when
    ``col_header_val`` reports a mismatch.

    Args:
        df: DataFrame to summarise.
        config_data: Configuration mapping handed to ``col_header_val``.

    Returns:
        None. The summary (column count, row count, column names and size)
        is printed. The value labelled "File Size" is the DataFrame's
        in-memory footprint, not the size of the file on disk.
    """
    if col_header_val(df, config_data) != 1:
        return

    col_names = list(df.columns)
    no_of_rows, no_of_cols = df.shape
    # memory_usage() yields one entry per column plus the index; humanbytes()
    # needs a single number, so add them up.
    size = df.memory_usage(deep=True).sum()
    file_size = humanbytes(size)
    statistics = f"\nNo. of Columns: {no_of_cols} \nNo. of Rows: {no_of_rows} \nColumn Names: {col_names} \nFile Size: {file_size}"
    print(statistics)

Overwriting testutility.py


# Writing YAML File

In [19]:
%%writefile file.yaml
# Expected column headers of the source file. The header check lower-cases
# these names before comparing them with the DataFrame's columns.
columns: 
  - row_id
  - timestamp
  - user_id
  - content_id
  - content_type_id
  - task_container_id
  - user_answer
  - answered_correctly
  - prior_question_elapsed_time
  - prior_question_had_explanation
# Not read by the notebook cells shown alongside this file.
dataset_name: trainfile
# Per-column dtype mapping, passed unchanged to pd.read_csv(dtype=...).
dtypes: 
  answered_correctly: int8
  content_id: int16
  content_type_id: boolean
  prior_question_elapsed_time: float32
  prior_question_had_explanation: boolean
  row_id: int64
  task_container_id: int16
  timestamp: int64
  user_answer: int8
  user_id: int32
# The source path is built as "<file_name>.<file_type>", i.e. train.csv.
file_name: train
file_type: csv
# Field separator used when reading the source file.
inbound_delimiter: ","
# The keys below are not read by the notebook cells shown alongside this
# file; presumably they serve a later export/load step (TODO confirm).
outbound_delimiter: "|"
skip_leading_rows: 1
table_name: edsurv

Overwriting file.yaml


In [20]:
# Read config file
import testutility as util
config_data = util.read_config_file("file.yaml")

# read_config_file() only logs YAML syntax errors and then returns None (an
# empty file parses to None as well), so stop here with a clear message
# instead of failing later on a None lookup.
if config_data is None:
    raise ValueError("file.yaml is empty or could not be parsed - check the logged YAML error")

In [21]:
# Display the parsed configuration to confirm that the column list, dtypes
# and delimiter settings were loaded as intended.
config_data

{'columns': ['row_id',
  'timestamp',
  'user_id',
  'content_id',
  'content_type_id',
  'task_container_id',
  'user_answer',
  'answered_correctly',
  'prior_question_elapsed_time',
  'prior_question_had_explanation'],
 'dataset_name': 'trainfile',
 'dtypes': {'answered_correctly': 'int8',
  'content_id': 'int16',
  'content_type_id': 'boolean',
  'prior_question_elapsed_time': 'float32',
  'prior_question_had_explanation': 'boolean',
  'row_id': 'int64',
  'task_container_id': 'int16',
  'timestamp': 'int64',
  'user_answer': 'int8',
  'user_id': 'int32'},
 'file_name': 'train',
 'file_type': 'csv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'table_name': 'edsurv'}

In [None]:
import pandas as pd

# Everything needed to load the file comes from the YAML config: the path is
# "<file_name>.<file_type>", and the delimiter and per-column dtypes are
# handed straight to pandas.
file_type = config_data['file_type']
source_file = f"{config_data['file_name']}.{file_type}"
datatypes = config_data['dtypes']
df = pd.read_csv(
    source_file,
    sep=config_data['inbound_delimiter'],
    dtype=datatypes,
)
df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False
3,3,131167,115,7860,False,3,0,1,19000.0,False
4,4,137965,115,7922,False,4,1,1,11000.0,False


In [None]:
from testutility import  col_header_val, stats

In [24]:
# col_header_val() returns 0 on a mismatch and 1 on success; report the
# outcome with a single message.
print(
    "Validation Failed!!"
    if col_header_val(df, config_data) == 0
    else "Column Validation Passed.."
)

Column name and Column length Validation Passed!!
Column Validation Passed..


In [46]:
# Re-validate the header and, if it passes, print the column count, row
# count, column names and size of the DataFrame.
stats(df, config_data)

Column name and Column length Validation Passed!!

No. of Columns: 10 
No. of Rows: 101230332 
Column Names: ['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id', 'task_container_id', 'user_answer', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation'] 
File Size: 3.21 GB
