In [None]:
# Colab-only helper module that exposes Google Drive to the notebook runtime
from google.colab import drive

# Attach Google Drive under /gdrive so that later cells can read and write files there
drive.mount(mountpoint='/gdrive')

Mounted at /gdrive


In [None]:
# Environment setup for the notebook.
# - %pip (rather than !pip) installs into the environment of the running kernel.
# - Every package is listed exactly once (pytesseract used to be installed twice).
# - apt-get runs with -y because the notebook shell cannot answer the
#   interactive "Do you want to continue?" prompt.
%pip install nltk
%pip install pymupdf
!apt-get install -y poppler-utils
%pip install pytesseract
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
%pip install Pillow
%pip install PyPDF2
%pip install pdf2image
%pip install beautifulsoup4
%pip install textract

In [None]:
import glob  # Module for file pattern matching
import os  # Module for operating system functions
import nltk  # Natural Language Toolkit library
import numpy as np  # Library for numerical computations
from nltk.corpus import stopwords  # Corpus containing stopwords
from nltk.tokenize import word_tokenize  # Tokenization function
import json  # Library for JSON handling
import re  # Regular expression operations
from nltk.stem.wordnet import WordNetLemmatizer  # Lemmatization function
import io  # Module for handling I/O operations
import pandas as pd  # Library for data manipulation and analysis
import PyPDF2  # Library for working with PDF files
import pandas as pd  # Library for data manipulation and analysis
import re  # Regular expression operations
import textract  # Library for extracting text from various file formats
from datetime import datetime, date  # Modules for date and time manipulation

In [None]:
# Show up to 400 characters per cell when a DataFrame is rendered.
# The fully qualified option name is used on purpose: abbreviated keys such as
# 'max_colwidth' are resolved by pattern matching and can become ambiguous
# when pandas adds options with similar names.
pd.set_option("display.max_colwidth", 400)
# Print the option's documentation together with its current value
pd.describe_option("display.max_colwidth")

In [None]:
class ExtractData:
    """
    Run OCR over a directory of PDF files and collect the extracted text.

    For every ``*.pdf`` file found in ``pdf_dir`` the text is extracted with
    textract (Tesseract method), written to ``<json_dir>/<name>.json`` as
    ``{"Filename": ..., "Text": ...}`` and appended as one row to the DataFrame.

    :param df: DataFrame the extracted rows are appended to (may be empty)
    :param pdf_dir: Directory that is searched for ``*.pdf`` files
    :param json_dir: Directory the JSON files are written to; created if missing
    """

    def __init__(self, df, pdf_dir="/gdrive/MyDrive/covid_pdf", json_dir="/gdrive/MyDrive/covid_json/"):
        self.df = df
        self.pdf_dir = pdf_dir
        self.json_dir = json_dir

    def process(self):
        """
        Process all PDF documents found in ``pdf_dir``.

        :return: The DataFrame with one additional row per processed PDF
        """
        self.__read_pdf()
        return self.df

    def __read_pdf(self):
        """
        Extract every PDF, write its JSON file and append the rows to ``self.df``.
        """
        # glob.escape keeps directories containing '[', '*' or '?' working;
        # sorting makes the row order reproducible across runs
        pattern = os.path.join(glob.escape(self.pdf_dir), "*.pdf")
        pdf_files = sorted(glob.glob(pattern))

        # open(..., "w") does not create missing directories
        os.makedirs(self.json_dir, exist_ok=True)

        records = []
        try:
            for pdf_path in pdf_files:
                # "<pdf_dir>/report.pdf" -> "<json_dir>/report.json"
                file_without_ext = os.path.splitext(os.path.basename(pdf_path))[0]
                json_path = os.path.join(self.json_dir, file_without_ext + ".json")
                records.append(self.__save_json(pdf_path, json_path))
        finally:
            # Build the new rows in a single concat instead of once per file.
            # The finally block keeps the rows extracted so far on self.df
            # even when a later PDF fails.
            if records:
                self.df = pd.concat([self.df, pd.DataFrame(records)], ignore_index=True)

    @staticmethod
    def __text_from_pdf(pdf_path):
        """
        Extract the text of a PDF file via OCR.

        :param pdf_path: Path of the PDF file
        :return: Extracted text as ``str``, or ``None`` when no text was found
        """
        # textract returns UTF-8 encoded bytes
        raw = textract.process(pdf_path, method='tesseract', encoding='utf-8', language='eng')
        text = raw.decode("utf-8")

        return text if text else None

    def __save_json(self, pdf_path, json_path):
        """
        Extract one PDF and store the result as a JSON file.

        :param pdf_path: Path of the PDF file
        :param json_path: Path to save the JSON file
        :return: The stored record, ``{"Filename": <name without extension>, "Text": <text or None>}``
        """
        data = {
            "Filename": os.path.splitext(os.path.basename(pdf_path))[0],
            "Text": self.__text_from_pdf(pdf_path),
        }

        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # must be opened as UTF-8 explicitly instead of the locale default
        with open(json_path, "w", encoding="utf-8") as fh:
            json.dump(data, fh, ensure_ascii=False)

        return data

class TransformData:
    """
    Derive structured columns from the OCR text of PCR test reports.

    ``process`` reads the ``Text`` column and adds the columns ``CT``,
    ``Gender``, ``Age``, ``PCR_Status``, ``Date`` and ``Location``. Every value
    is found with regular expressions written for specific report layouts; a
    field that cannot be found is left missing (``Location`` falls back to
    ``"INDIA"``).

    :param df: DataFrame with a ``Text`` column; it is copied, never modified
    """

    def __init__(self, df):
        self.df = df.copy()

    def process(self):
        """
        Add the extracted columns and return the transformed DataFrame.

        :return: Copy of the input DataFrame with the extracted columns added
        """
        # Reports without OCR text (None / NaN) are treated as empty text, so
        # every extractor simply reports "not found" instead of raising
        texts = self.df["Text"].map(lambda value: value if isinstance(value, str) else "")

        # Series.map keeps each result as returned. np.vectorize infers the
        # output dtype from the first result only, which turned later None
        # results into the string "None" and failed on empty input.
        self.df["CT"] = texts.map(self.__extract_ct)
        self.df["Gender"] = texts.map(self.__extract_gender)
        self.df["Age"] = texts.map(self.__extract_age)
        self.df["PCR_Status"] = texts.map(self.__extract_result)
        self.df["Date"] = texts.map(self.__extract_date)
        self.df["Location"] = texts.map(self.__extract_location)

        # Convert the 'Date' column to datetime format (missing dates become NaT)
        self.df["Date"] = pd.to_datetime(self.df["Date"])

        return self.df

    def __remove_special_char(self, text):
        """Replace each punctuation character of the fixed set below with a space."""
        return re.sub(r"[!@#~`%^&*(){};:/<>?\|_]", " ", text)

    def __extract_date(self, text):
        """
        Find the report date in the text.

        :param text: OCR text of one report
        :return: Upper-cased date string with its label removed, or ``None``
        """
        found = None
        matches = re.findall(r"Sample Drawn On[\_]?\s*?[\:\+\;]?\s*?.[0-9]*?[\/][0-9]*[\/][0-9]{4}|Report generated on[\\n|\n]*Date\:.+?[0-9]*?[\.][0-9]*[\.][0-9]{4}|Reported\s*?\:?\s*?.[0-9]*?[\/][\w]*[\/][0-9]{4}|ported\s*?\:?\s*?.[0-9]*?[\-][\w]*[\-][0-9]{4}|Report Released on \(RRT\)[\n|\\n]*?.[0-9]? [\w]* [0-9]{4}|Reported Date[\s]*?\:[\s]*?[0-9]*?[\/][0-9]*[\/][0-9]{2}|Date and time of reporting[\n|\\n]*?.[0-9]*?[\.][0-9]*?[\.][0-9]{4}|Date of Service\:[\s]*?.[0-9]*?[\/][0-9]*?[\/][0-9]{4}|[|] Report Released on.+[0-9]{2} [\w]* [0-9]{4}|Report Dispatched Date\:\s*?[0-9]{4}[\-][0-9]*[\-][0-9]{2}|Received [\+|\=\:] [0-9]*?[\/][0-9]*?[\/][0-9]{4}|\s\w+\s*[0-9]+[\,]\s*[0-9]{4}.+ET|Reported On \: [0-9]*?[\-][0-9]*?[\-][0-9]{4}|Report Released on.+\n*.+\(RR.+[0-9]*\s*[\w]+\s*[0-9]{4}", text)
        if matches:
            found = matches[0].upper()
            # Labels are removed in this exact order; longer labels must be
            # handled before their suffixes (e.g. "REPORTED :" before "PORTED :")
            labels = (
                "REPORT RELEASED ON (RRT)\n\n",
                "REPORTED DATE :",
                "DATE AND TIME OF REPORTING\n\n",
                "SAMPLE DRAWN ON_ :",
                "DATE AND TIME OF REPORTING",
                "REPORTED DATE:",
                "SAMPLE DRAWN ON:",
                "DATE OF SERVICE:",
                "REPORT GENERATED ON\n\nDATE:",
                "SAMPLE DRAWN ON;",
                "REPORT DISPATCHED DATE:",
                "REPORTED =",
                "REPORTED :",
                "PORTED :",
                "RECEIVED : ",
                "REPORTED ON : ",
            )
            for label in labels:
                found = found.replace(label, "")
            found = re.sub(r" ET$", "", found)
            # Normalise dotted dates (dd.mm.yyyy) to slashes
            found = found.replace(".", "/")
            found = re.sub(r"^\[", "", found)
            found = found.replace("\n", "")
            found = found.replace("REPORT RELEASED ON : / :| (RR :", "")
            found = found.strip()

        return found

    def __extract_location(self, text):
        """
        Find the location of the laboratory / facility in the text.

        :param text: OCR text of one report
        :return: Upper-cased location; ``"INDIA"`` when nothing matches
        """
        location = "INDIA"
        matches = re.findall(r"Facility Name CHC.+[\n|\\n]*?Patient Name|Location\s\w+\,\s\w+|\w*\s*\w*\,\s*\w*\s*\w*\s\w*\-*[\n\s0-9]*.+wellness@thyrocare\.com|.+\w+\,\s*\w+\s[0-9]*\nMedical Director\:|.+\w+\,\s*\w+\s[0-9]*.Medical\nDirector\:|HOSPITAL SAIL|Hospital\, Rourkela|Madinaguda\, Serlingampally|PROGRESS DRIVE|LANE\, CARSON CITY|DELHI|Bagmati Province\, Nepal|Tirana\, Albania", text)
        if matches:
            location = matches[0].upper()
            for label in ("FACILITY NAME CHC", "PATIENT NAME", "LOCATION", "WELLNESS@THYROCARE.COM"):
                location = location.replace(label, "")
            # Drop digits, '|', '©', newlines, '-', '/', '®' and '&'
            location = re.sub(r"[\d|\©|\n|\-|\/|\®|\&]", "", location)
            location = location.strip()
            # Known address fragments are collapsed to their country
            if any(marker in location for marker in ("MEDICAL DIRECTOR", "MEDICALDIRECTOR:", "PROGRESS DRIVE", "LANE, CARSON CITY")):
                location = "USA"
            if any(marker in location for marker in ("HOSPITAL SAIL", "HOSPITAL, ROURKELA", "MADINAGUDA, SERLINGAMPALLY", "DELHI")):
                location = "INDIA"

        return location

    def __extract_result(self, text):
        """
        Find the PCR result in the text.

        :param text: OCR text of one report
        :return: ``"NO"`` for a negative result, ``"YES"`` for a positive one,
                 ``None`` when no known wording is found
        """
        # Negative wordings are checked first and win over positive ones
        if re.search(r"DETECTED\(NEGATIVE\)|Result Negative| Not Detected |DETECTED\(Negative\)|QUALITATIVE PCR NOT DETECTED|\sNEGATIVE[\\n|\n]|RNA NOT DETECTED NOT| System\) Negative[\\n|\n]|Negative[\\n|\n]*\(Real Time PCR\)|[\n|\\n]*Result.+Negative[\n|\\n]*|RNA NOT DETECTED NOT", text):
            return "NO"
        # NOTE(review): the alternative "Result[\n|\s]*Negative" below sits in the
        # positive list, so "Result\nNegative" is reported as "YES". It is kept
        # unchanged because it may be a workaround for one report layout -
        # confirm against the source PDFs.
        if re.search(r'PCR POSITIVE|\:\sDETECTED\([POSITIVE|positive]*\)[\\n|\n]|PCR DETECTED[\n|\\n]|PCR IDETECTED\(Positive\)|\\nResult Positive\\n|[\n|\\n]Result [Positive|POSITIVE]*[\n|\\n]|Result[\n|\s]*Negative|[\n|\\n]Result positive[\n|\\n]', text):
            return "YES"
        return None

    def __extract_ct(self, text):
        """
        Find the CT value in the text.

        :param text: OCR text of one report
        :return: The number following "CT VALUE" as a string, or ``None``
        """
        value = None
        matches = re.findall(r"CT VALUE [0-9]+[\.|\,][0-9]+", text)
        if matches:
            value = matches[0].upper().replace("CT VALUE ", "").strip()

        return value

    def __extract_gender(self, text):
        """
        Find the patient's gender in the text.

        :param text: OCR text of one report
        :return: Upper-cased gender (``"M"``/``"A"`` become ``"MALE"``,
                 ``"F"`` becomes ``"FEMALE"``), or ``None``
        """
        gender = None

        matches = re.findall(r"Years\s*[\/]\s*\b[\w]+\b|Gender\s*(?!\d)\w+|\/\w+\s*Barcode No|[0-9]+[¥|Y]\/[A-Z]|Gender\s\:\s[a-zA-Z]+|\| [\d]+ \[[MALE|FEMALE|\w]+\s*\||Gender\:\s\w+\|[FEMALE|MALE]|Gender: [Female|Male]| [0-9]?[0-9][\.] \|[FEMALE|MALE]|[\d]?[\d] \|[FEMALE|MALE]|Yrs \/ [Male|Female]|reEMALE|\[FEMALE", text)
        if matches:
            gender = matches[0].upper()
            # Remove labels; "REEMALE" is an OCR misreading handled as female
            for label, replacement in (("YEARS", ""), ("GENDER", ""), ("BARCODE NO", ""), ("YRS", ""), ("REEMALE", "F")):
                gender = gender.replace(label, replacement)
            # r"\n" is a literal backslash followed by 'n', not a newline
            gender = self.__remove_special_char(gender.strip().replace(r"\n", "")).strip()
            # Strip age remnants such as "45Y" or "45 [" and any other digits
            gender = re.sub(r"^[0-9]+[Y|¥]", "", gender).strip()
            gender = re.sub(r"^\d+\s\[", "", gender).strip()
            gender = re.sub(r"\d", "", gender).strip()
            gender = gender.replace(".", "").strip()
            gender = gender.replace("[", "")
            if gender in ["M", "A"]:
                gender = "MALE"
            if gender == "F":
                gender = "FEMALE"

        return gender

    def __extract_age(self, text):
        """
        Find the patient's age in the text.

        The age is either printed directly or computed from a date of birth
        relative to today's date.

        :param text: OCR text of one report
        :return: Age as a string, or ``None`` when no age is found or the
                 date of birth cannot be parsed
        """
        age = None
        matches = re.findall(r"\d+\s+[Yy]ears|Age\s*\d+|Gender\s*[\:]?\d+|[0-9]+[¥|Y]\/[A-Z]|Birth\s\:\s[\d]?[\d]\/[\d]?[\d]\/[\d]{4}\s*\(\d+\)|Collection location[\\n|\n]*[a-zA-Z].+[0-9]+\,\s*[0-9]{4}|\| [\d]+ \[[MALE|FEMALE]|Date of Birth\:\s[\d]?[\d]\/[\d]?[\d]\/[\d]{4}|\s[0-9]*\s\|[FEMALE|MALE]|[\|]? [\d]+[\.]? [\||\[]?[MALE|FEMALE|reEMALE]|Age: [0-9]* Gender|Gender : [0-9]* Yrs", text)
        if matches:
            age = matches[0].upper()
            # "Birth : dd/mm/yyyy (NN)" -> keep the bracketed age
            bracketed = re.findall(r"\(\d+\)$", age)
            if bracketed:
                age = bracketed[0].upper()
            # "NN years", "NNY/M" -> keep the leading number
            leading = re.findall(r"^\d+", age)
            if leading:
                age = leading[0].upper()
            # Remove labels and gender letters (order matters)
            for label in ("YEARS", "AGE", "GENDER", "COLLECTION LOCATION\n\n", "DATE OF BIRTH: ", "F"):
                age = age.replace(label, "")
            age = age.replace("M", "").strip()
            age = age.replace("YRS", "")
            age = age.replace("R", "")

            if len(age) > 9:
                # Anything this long is a date of birth, not an age
                today = date.today()
                age = age.replace(" ", "/")
                try:
                    # Take the date straight from the parsed timestamp. Editing
                    # its string form (stripping "00") corrupted years such as
                    # 2000-2009.
                    born = pd.to_datetime(age).date()
                except (ValueError, TypeError, OverflowError):
                    # Unreadable date of birth: report the age as not found
                    return None
                # Subtract one year if this year's birthday has not happened yet
                age = str(today.year - born.year - ((today.month, today.day) < (born.month, born.day)))
            else:
                age = self.__remove_special_char(age).strip()
            age = age.replace(" [M", "")
            age = age.replace(" [F", "")
            age = age.replace(" [", "")
        return age

In [None]:
# Start from an empty DataFrame and hand it to the extractor, which
# OCRs every PDF, writes one JSON file per PDF and appends one row per PDF
extract_data = ExtractData(pd.DataFrame({}))

# 'df' holds the extraction result: one row per PDF with its file name and text
df = extract_data.process()

In [None]:
# Wrap the extracted rows in the transformer (it works on its own copy of 'df')
transform = TransformData(df)

# Run the regex-based extraction of CT, Gender, Age, PCR_Status, Date and Location;
# 'df' is rebound to the transformed DataFrame
df = transform.process()

  born = datetime.strptime(str(pd.to_datetime(age)).replace("00","").replace(":","").strip(), "%Y-%m-%d").date()
  self.df["Date"] = pd.to_datetime(self.df["Date"])


In [None]:
# Place names that should be reported under their country
indian_places = ["VARNI","BODHAN", "WARANGAL", "MUMBAI", "DELHI"]

# Rows whose 'Location' is exactly one of those names are rewritten to "INDIA"
is_indian_place = df["Location"].isin(indian_places)
df.loc[is_indian_place, "Location"] = "INDIA"

In [None]:
# Keep only the columns that belong in the final report, in output order
selected_columns = df.loc[:, ["Date", "PCR_Status", "Gender", "Age", "Location", "CT"]]

# Write the selection to "result.csv" in the current working directory
selected_columns.to_csv("result.csv")