# Open the preprocessed sitrep file and do the spaCy work

In [1]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 
from fuzzywuzzy import fuzz
import time
import uuid

import os
import json

from datetime import datetime

In [2]:
# Notebook setup: never truncate columns when a frame is displayed, and print
# the local wall-clock time at which this cell ran.
pd.set_option('display.max_columns', None)
print(time.localtime())

# Input workbook, and the folder that the export cell writes into.
sitrep_file = "D://projects//_external_files//surveyor//rw_sitrep_preprocessed//disaster_sitrep_json_parsed_bdfc9ddf9d3b40eba3f11669a3619268.xlsx"
fpath = "D://projects//_external_files//surveyor//rw_sitrep_preprocessed//"

# Load the workbook and turn every missing cell into an empty string so the
# string-cleaning functions below never see NaN.
df_sitrep = pd.read_excel(sitrep_file).fillna('')

time.struct_time(tm_year=2023, tm_mon=12, tm_mday=16, tm_hour=14, tm_min=48, tm_sec=5, tm_wday=5, tm_yday=350, tm_isdst=0)


In [3]:
# Spelled-out quantities the scrubber rewrites as digits.
_SPELLED_NUMBERS = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'dozen': 12,
}

# One run of ASCII letters, optionally wrapped in punctuation/whitespace,
# e.g. "four," or "(two)".  Tokens with anything else inside are not numbers.
_SPELLED_NUMBER_TOKEN = re.compile(r'([\W_]*)([a-zA-Z]+)([\W_]*)')


def secondary_pass_text_scrub(text):
    """Normalise free text ahead of NLP processing.

    Steps, applied in this order:

    1. spelled-out numbers up to twenty (and "dozen") become digits;
    2. cardinal directions are lower-cased and compounds such as
       "North-East" are joined into "northeast";
    3. "local time" / "local-time" becomes "localtime";
    4. single-line ``[...]`` spans are removed;
    5. "per cent" becomes "percent";
    6. newlines become spaces;
    7. "34km" / "34 km" becomes "34 kilometers";
    8. "..." becomes ". ";
    9. http(s) URLs are removed;
    10. runs of whitespace collapse to one space;
    11. leading non-alphanumerics are dropped and hyphens are rewritten
        ("COVID-19" -> "COVID_19", "5-10" -> "5 to 10");
    12. surrounding whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.

    Raises:
        TypeError: If ``text`` is not a ``str``.
    """

    def convert_spelled_nums_to_digit(token):
        """Return ``token`` with a spelled-out number replaced by digits.

        Punctuation around the word is kept ("four," -> "4,") so sentence
        boundaries survive; any other token is returned unchanged.
        """
        match = _SPELLED_NUMBER_TOKEN.fullmatch(token)
        if match is None:
            return token
        prefix, word, suffix = match.groups()
        number = _SPELLED_NUMBERS.get(word.lower())
        if number is None:
            return token
        return f"{prefix}{number}{suffix}"

    def standardize_hyphens(text):
        """Strip leading symbols and rewrite hyphens between alphanumerics."""
        # Drop everything before the first ASCII letter or digit.
        # NOTE(review): this also removes leading non-ASCII letters
        # (e.g. accented or non-Latin text) - confirm that is intended.
        text = re.sub(r'^[^a-zA-Z0-9]+', '', text)

        # letter-letter and letter-digit hyphens become underscores,
        # e.g. COVID-19 -> COVID_19.  Lookarounds are used so every hyphen in
        # a chain is converted (state-of-the-art -> state_of_the_art); a
        # consuming pattern would skip every second hyphen.
        text = re.sub(r'(?<=[a-zA-Z])-(?=[a-zA-Z\d])', '_', text)

        # digit-digit hyphens are read as ranges, e.g. 5-10 -> 5 to 10.
        # NOTE(review): this also rewrites hyphenated dates such as
        # 2023-12-16 - confirm whether dates can reach this function.
        text = re.sub(r'([\d])\-([\d])', r'\1 to \2', text)

        return text

    def standardize_cardinal_directions(text):
        """Lower-case direction words and join two-part compounds."""
        # Stand-alone words: "North" -> "north", "Eastern" -> "eastern".
        for word in ('north', 'northern', 'south', 'southern',
                     'east', 'eastern', 'west', 'western'):
            text = re.sub(rf'\b{word}\b', word, text, flags=re.IGNORECASE)

        # Compounds written with a space, a hyphen or nothing in between:
        # "north east" / "North-East" / "NorthEast" -> "northeast".
        for vertical in ('north', 'south'):
            for horizontal in ('east', 'west'):
                text = re.sub(
                    rf'\b{vertical}[\s-]?{horizontal}',
                    vertical + horizontal,
                    text,
                    flags=re.IGNORECASE,
                )

        return text

    def standardize_time_indicators(text):
        """Join "local time" / "local-time" into the single token "localtime"."""
        return re.sub(r'\b(local[\s-]?time)\b', 'localtime', text,
                      flags=re.IGNORECASE)

    # Fail early with a clear message; every step below needs a str.
    if not isinstance(text, str):
        raise TypeError(
            f"secondary_pass_text_scrub expects a str, got {type(text).__name__}: {text!r}"
        )

    # turn 'four' into 4
    text = ' '.join(convert_spelled_nums_to_digit(t) for t in text.split(" "))
    text = standardize_cardinal_directions(text)
    text = standardize_time_indicators(text)

    # get rid of remaining content within square brackets (single line only:
    # this runs before newlines are replaced and '.' does not match '\n')
    text = re.sub(r'\[.*?\]', '', text)

    # "per cent" to "percent"
    text = re.sub(r'per cent', 'percent', text)

    # "\n" to " "
    text = re.sub(r'\n', ' ', text)

    # change 34km to 34 kilometers
    text = re.sub(r'(\d+)\s?km\b', r'\1 kilometers', text)

    # ellipses become a sentence break; the replacement is a plain ". "
    # because an escaped dot would put a literal backslash into the text
    text = re.sub(r'\.\.\.', '. ', text)

    # remove urls
    text = re.sub(r'https?://\S+', '', text)

    # 2+ whitespace characters in a row
    text = re.sub(r'\s{2,}', ' ', text)

    text = standardize_hyphens(text)

    # Non-alphanumeric characters are deliberately NOT removed here because
    # that would strip text written in other scripts.

    # str.strip() returns a new string, so its result must be returned.
    return text.strip()

def undesirable_char_remover(text, ignore=(".", ",", " ", ":", "-", "_", "(", ")", ";", "/", "'")):
    """Keep only alphanumeric characters and the characters listed in ``ignore``.

    Alphanumeric is decided by ``str.isalnum``, so letters and digits from
    any script are kept. Each kept character appears exactly once in the
    result, even when it is both alphanumeric and listed in ``ignore``.

    Args:
        text: The string to filter.
        ignore: Characters to keep in addition to alphanumerics. The default
            is an immutable tuple so it cannot be altered between calls.

    Returns:
        The filtered string with surrounding whitespace stripped. If ``text``
        is not a ``str`` a message is printed and ``text`` is returned
        unchanged.
    """
    if not isinstance(text, str):
        print("must pass in a string to this function")
        return text

    # Build the lookup once instead of scanning the sequence per character.
    allowed = frozenset(ignore)
    return ''.join(c for c in text if c in allowed or c.isalnum()).strip()
    
def string_remove_parenthetical_content(text):
    """Drop every ``(...)`` span from ``text`` and squeeze the leftover gaps.

    A span runs from an opening parenthesis to the first closing one, so
    nested parentheses are only partly removed. Afterwards any run of two or
    more whitespace characters is replaced by a single space.
    """
    parenthetical = re.compile(r'\([^)]*\)')
    whitespace_run = re.compile(r'\s{2,}')

    without_parentheticals = parenthetical.sub('', text)
    return whitespace_run.sub(' ', without_parentheticals)

def reformat_iso_date(date_string):
    """Reduce an ISO-8601 date or datetime string to its ``YYYY-MM-DD`` part.

    The string is parsed with ``datetime.fromisoformat``, which raises
    ``ValueError`` when it is not valid ISO format.
    """
    moment = datetime.fromisoformat(date_string)
    return f"{moment:%Y-%m-%d}"
    

In [5]:
# Column clean-up plan as (target column, source column, transform).
# The steps run top to bottom; order matters because the later steps read
# the 'text' column written by the earlier ones.
_column_steps = (
    ('reported_date', 'reported_date', reformat_iso_date),
    ('text', 'source_original_text', secondary_pass_text_scrub),
    ('text', 'text', undesirable_char_remover),
    ('non_parenthetical_text', 'text', string_remove_parenthetical_content),
)
for _target, _source, _transform in _column_steps:
    df_sitrep[_target] = df_sitrep[_source].apply(_transform)

In [None]:
# Show one randomly chosen row as a quick visual check of the cleaned columns.
df_sitrep.sample()

In [6]:
def generate_uuid(x=None):
    """Return a new random UUID (version 4) as a 32-character hex string.

    Args:
        x: Ignored. It is accepted so existing calls that pass a value keep
            working; it now defaults to ``None`` so callers no longer need to
            supply a dummy argument.

    Returns:
        The UUID's hexadecimal digits, without hyphens.
    """
    return uuid.uuid4().hex

# Build the output path inside fpath; the random hex suffix gives each run its
# own file name (presumably so earlier exports are not overwritten).
output_file = f"{fpath}sitrep_preprocessed_{generate_uuid(1)}.xlsx"
# index=False leaves the DataFrame index out of the written workbook.
df_sitrep.to_excel(output_file, index=False)
# Echo the path so the generated file name is visible in the cell output.
print(output_file)

D://projects//_external_files//surveyor//rw_sitrep_preprocessed//sitrep_preprocessed_b41b8e78f66d4e669917ea831f438b73.xlsx
