In [1]:
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime
import time
import gc
import re

### Data Reading

##### Reading the Data with Pandas

In [2]:
start = time.time()
df = pd.read_csv("Journal data.csv")
end = time.time()
print("Time to read csv with pandas: ",(end-start),"sec")

Time to read csv with pandas:  68.469735622406 sec


##### Reading the Data with Dask

In [3]:
from dask import dataframe as dd
start = time.time()
dask_df = dd.read_csv("Journal data.csv",sample_rows=1000000,assume_missing=True)
end = time.time()
print("Time to read csv with dask: ", (end-start), "sec")

ValueError: EOF encountered while reading header. 
Pass argument `sample_rows` and make sure the value of `sample` is large enough to accommodate that many rows of data

In [5]:
import modin.pandas as mpd
start = time.time()
modin_df = mpd.read_csv("Journal data.csv")
end = time.time()
print("Time to read csv with modin: ",(end-start),"sec")


    import ray
    ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})

2023-05-12 22:42:17,574	INFO worker.py:1625 -- Started a local Ray instance.


Time to read csv with modin:  94.31718683242798 sec


In [6]:
import ray
ray.shutdown()
ray.init()
start = time.time()
ray_df = pd.read_csv("Journal data.csv",engine='c')
end = time.time()
print("Time to read csv with ray: ",(end-start),"sec")

2023-05-12 22:44:26,105	INFO worker.py:1625 -- Started a local Ray instance.


Time to read csv with ray:  79.28154039382935 sec


#### Ray is relatively a little faster in reading the data when compared to Pandas. While I have learnt that dask is the fastest to read data, the dask read functions fails to read the csv file above despite correcting the error by adding sample_rows parfameter

#### In the second instance of running the code, Pandas was faster compared to both ray and modin.

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51410 entries, 0 to 51409
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          51410 non-null  object
 1   doi               51410 non-null  object
 2   body_text         51410 non-null  object
 3   authors           51242 non-null  object
 4   title             51410 non-null  object
 5   journal           47388 non-null  object
 6   abstract_summary  51410 non-null  object
 7   country           51410 non-null  object
 8   word_count        51410 non-null  int64 
 9   language          51410 non-null  object
 10  publish_time      51410 non-null  object
 11  processed_text    51410 non-null  object
dtypes: int64(1), object(11)
memory usage: 4.7+ MB


In [8]:
df.head()

Unnamed: 0,paper_id,doi,body_text,authors,title,journal,abstract_summary,country,word_count,language,publish_time,processed_text
0,fd1afc537dace4f2cd2bcea5489396fc6620ffd0,10.5005/jp-journals-10005-1770,A sudden appearance of unidentified disease ch...,"Ayyed, Ahmed Basheer",Dental Practice Infection Control<br>Measurem...,Int J Clin Pediatr Dent,A new coronavirus disease called COVID-19 has...,,2970,en,2020-01-01,"['sudden', 'appearance', 'unidentified', 'dise..."
1,fc8d5f7612c3024599e3b0b0c2a2743056800f39,10.4103/ijmm.ijmm_20_138,coronavirus of Group 2B. [15] Initial analysis...,"Chakravarti, Anita. Upadhyay, Shalini. <br>B...","Current Understanding, Knowledge Gaps and a<b...",Indian Journal of Medical Microbiology,Review Article: Covid Series,,3412,en,2020-01-01,"['coronavirus', 'group', 'initial', 'analysis'..."
2,0f958977a0ba564b87aea0dbc569b454be2e2d3c,10.22037/ijpr.2020.113821.14506,"New Coronavirus, SARS-CoV-2 (Severe Acute Resp...","Mohammadi Barzelighi, Hajar. Daraei, Bahram....",Approaches for the Treatment of SARS-CoV-2<br...,Iran J Pharm Res,The emergence of a novel Coronavirus disease,Iran,6819,en,2020-01-01,"['new', 'coronavirus', 'sars-cov', 'severe_acu..."
3,7484516fe4e4c02498b086a5b11ae908c5b4fcfb,10.12688/gatesopenres.13168.1,Communicable diseases constitute a global heal...,"Paul, Alicia. Upreti, Kamana. Nepal,<br>Shr...",Rejoice architecture meets social norms to<br...,Gates Open Res,"Background: Each year, 600,000 children under...",Nepal,5320,en,2020-01-01,"['communicable', 'disease', 'constitute', 'glo..."
4,f57034777b3dece6874a7d2e85bb942932490c10,10.5005/jp-journals-10005-1885,Coronavirus disease-2019 (COVID- 19) refers to...,"Alsaleh, Majd M. Sabbarini, Jumana M. <br>Al...",Changes in Behavior Management and Treatment<...,Int J Clin Pediatr Dent,Objective: This study aims to assess the<br>k...,,4246,en,2020-01-01,"['coronavirus', 'disease', 'covid', 'refers', ..."


### Validation

In [9]:
%%writefile utility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re

# Reading File  

def read_config_file(filepath):
    with open(filepath, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)


def replacer(string, char):
    pattern = char + '{2,}'
    string = re.sub(pattern, char, string) 
    return string

def matching(df,config_data):
    expected_col = list(config_data['columns'])
    if len(df.columns) == len(expected_col) and list(expected_col) == list(df.columns):
        print("column name and column length validation passed")
        return 1
    else:
        print("column name and column length validation failed")
        mismatched_columns_file = list(set(df.columns).difference(expected_col))
        print("Following File columns are not in the YAML file",mismatched_columns_file)
        missing_YAML_file = list(set(expected_col).difference(df.columns))
        print("Following YAML columns are not in the file uploaded",missing_YAML_file)
        logging.info(f'df columns: {df.columns}')
        logging.info(f'expected columns: {expected_col}')
        return 0

Overwriting utility.py


In [10]:
%%writefile store.yaml
file_type: csv
dataset_name: file
file_name: Journal data.csv
table_name: edsurv
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
columns: 
    - paper_id
    - doi
    - body_text
    - authors
    - title
    - journal
    - abstract_summary
    - country
    - word_count
    - language1
    - publish_time
    - processed_text
    

Overwriting store.yaml


In [11]:
#Reading config file
import utility as util
config_df = util.read_config_file("store.yaml")

In [12]:
print(config_df)

{'file_type': 'csv', 'dataset_name': 'file', 'file_name': 'Journal data.csv', 'table_name': 'edsurv', 'inbound_delimiter': ',', 'outbound_delimiter': '|', 'skip_leading_rows': 1, 'columns': ['paper_id', 'doi', 'body_text', 'authors', 'title', 'journal', 'abstract_summary', 'country', 'word_count', 'language1', 'publish_time', 'processed_text']}


In [13]:
def matching(df,config_data):
    expected_col = list(config_data['columns'])
    if len(df.columns) == len(expected_col) and list(expected_col) == list(df.columns):
        print("column name and column length validation passed")
        return 1
    else:
        print("column name and column length validation failed")
        mismatched_columns_file = list(set(df.columns).difference(expected_col))
        print("Following File columns are not in the YAML file",mismatched_columns_file)
        missing_YAML_file = list(set(expected_col).difference(df.columns))
        print("Following YAML columns are not in the file uploaded",missing_YAML_file)
        logging.info(f'df columns: {df.columns}')
        logging.info(f'expected columns: {expected_col}')
        return 0
matching(df,config_df)

column name and column length validation failed
Following File columns are not in the YAML file ['language']
Following YAML columns are not in the file uploaded ['language1']


0

In [14]:
import datetime
import csv
import gzip


# Write csv in gz format in pipe separated text file (|)
df.to_csv("Journal data.csv.gz",
          sep='|',
          header=True,
          index=False,
          quoting=csv.QUOTE_ALL,
          compression='gzip',
          quotechar='"',
          doublequote=True)

In [15]:
os.path.getsize("Journal data.csv.gz")

730181044