**Task #1: Installing Apache Tika and Other Libraries**

In [None]:
!pip install tika
!pip install thefuzz

Collecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Created wheel for tika: filename=tika-2.6.0-py3-none-any.whl size=32624 sha256=c3480f958d14e0752818e430a3ff9da18484dc63762e64c6c8bb72b780533ec2
  Stored in directory: /root/.cache/pip/wheels/27/ba/2f/37420d1191bdae5e855d69b8e913673045bfd395cbd78ad697
Successfully built tika
Installing collected packages: tika
Successfully installed tika-2.6.0
Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install wikipedia datefinder



**Task #2: Downloading the Haunted Places dataset.**

In [None]:
# Please upload the 'haunted_places.csv' file to session storage in order to run
# the Colab successfully

In [None]:
import pandas as pd

# Reading the dataset to store as dataframe
df = pd.read_csv('haunted_places.csv')

**Task #3: Combining a TSV file for the Haunted Places dataset.**

In [None]:
import csv

with open('haunted_places.csv','r') as csvin, open('haunted_places.tsv', 'w') as tsvout:
    csvin = csv.reader(csvin)
    tsvout = csv.writer(tsvout, delimiter='\t')

    for row in csvin:
        tsvout.writerow(row)

**Task #4: Expanding the dataset with new features.**

*4a.) Adding the "Audio Evidence" feature to the dataset.*

In [None]:
from thefuzz import fuzz

# Defining keywords for our new features
audio_keywords = ["moan", "groan", "splash", "boom", "buzz", "scream", "yell", "shout", "cry", "honk", "banging", "footsteps", "hear", "knocking", "laughter", "sound", "voice", "whisper", "breathing", "chanting", "growl", "noise", "wail", "murmur", "howl"]

# Function to check for fuzzy keyword matches in description
def contains_keywords_fuzzy(description, keywords, threshold=85):
    if isinstance(description, str):
        return any(fuzz.partial_ratio(keyword.lower(), description.lower()) >= threshold for keyword in keywords)
    return False

# Applying fuzzy matching to detect audio evidence
df["Audio Evidence"] = df["description"].apply(lambda x: contains_keywords_fuzzy(x, audio_keywords))

# Counting the number of True values in each evidence column
audio_evidence_count = df["Audio Evidence"].sum()
total_rows = len(df)

# Calculating and printing proportions
audio_evidence_ratio = audio_evidence_count / total_rows

# Printing our results
print(f"Number of rows with Audio Evidence: {audio_evidence_count}")
print(f"Proportion of Audio Evidence: {audio_evidence_ratio:.2%}")

# Displaying the first 50 rows of the dataset
df.head(10)

Number of rows with Audio Evidence: 5736
Proportion of Audio Evidence: 52.18%


Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Haunted Places Date,Daylight Duration 2,Audio Evidence
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,1954-03-12 00:00:00,11:44,True
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,2006-07-07 00:00:00,15:06,True
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,1929-05-30 00:00:00,14:58,False
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,1826-06-18 00:00:00,15:14,True
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,1900-03-12 00:00:00,11:45,True
5,Albion,United States,A mysterious lady in white has been spotted in...,Riverside Cemetery,Michigan,MI,-84.753056,42.236814,-84.75303,42.243097,2025-03-01 00:00:00,11:16,False
6,Algoma Township,United States,On a winding dirt road next to the Rogue River...,Hell's Bridge,Michigan,MI,,,-85.62293,43.149293,1644-03-12 00:00:00,11:46,True
7,Algonac,United States,Morrow Road is a Haunted road in Algonac Michi...,Morrow Road,Michigan,MI,-82.57629,42.652997,-82.531018,42.618367,2017-07-12 00:00:00,15:04,True
8,Allegan,United States,"People report hearing footsteps, and doors sla...",Elks Lodge,Michigan,MI,-85.841599,42.520552,-85.855303,42.529199,2008-03-12 00:00:00,11:47,True
9,Allegan,United States,Various ghostly activities. News coverage abou...,The Grill House and the Rock Bottom Bar,Michigan,MI,-85.857564,42.497762,-85.855303,42.529199,2008-04-26 00:00:00,13:54,True


*4b.) Adding the "Image/Video/Visual Evidence" feature to the dataset.*

In [None]:
from thefuzz import fuzz

# Defining keywords for our new features
visual_keywords = ["apparition", "figure", "glowing", "light", "mist", "orb", "see", "shadow", "shape", "vision", "flickering", "glimpse", "manifestation", "movement", "phantom", "reflection", "saw", "translucent", "smoke", "names of children written on walls"]

# Function to check for fuzzy keyword matches in description
def contains_keywords_fuzzy(description, keywords, threshold=85):
    if isinstance(description, str):
        return any(fuzz.partial_ratio(keyword.lower(), description.lower()) >= threshold for keyword in keywords)
    return False

# Applying fuzzy matching to detect evidence
df["Image/Video/Visual Evidence"] = df["description"].apply(lambda x: contains_keywords_fuzzy(x, visual_keywords))

# Counting the number of True values in each evidence column
visual_evidence_count = df["Image/Video/Visual Evidence"].sum()
total_rows = len(df)

# Calculating and printing proportions
visual_evidence_ratio = visual_evidence_count / total_rows

# Printing our results
print(f"Number of rows with Image/Video/Visual Evidence: {visual_evidence_count}")
print(f"Proportion of Image/Video/Visual Evidence: {visual_evidence_ratio:.2%}")

# Displaying the first 50 rows of the dataset
df.head(10)

Number of rows with Image/Video/Visual Evidence: 6940
Proportion of Image/Video/Visual Evidence: 63.14%


Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Haunted Places Date,Daylight Duration 2,Audio Evidence,Image/Video/Visual Evidence
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,1954-03-12 00:00:00,11:44,True,True
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,2006-07-07 00:00:00,15:06,True,True
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,1929-05-30 00:00:00,14:58,False,True
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,1826-06-18 00:00:00,15:14,True,True
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,1900-03-12 00:00:00,11:45,True,False
5,Albion,United States,A mysterious lady in white has been spotted in...,Riverside Cemetery,Michigan,MI,-84.753056,42.236814,-84.75303,42.243097,2025-03-01 00:00:00,11:16,False,False
6,Algoma Township,United States,On a winding dirt road next to the Rogue River...,Hell's Bridge,Michigan,MI,,,-85.62293,43.149293,1644-03-12 00:00:00,11:46,True,True
7,Algonac,United States,Morrow Road is a Haunted road in Algonac Michi...,Morrow Road,Michigan,MI,-82.57629,42.652997,-82.531018,42.618367,2017-07-12 00:00:00,15:04,True,True
8,Allegan,United States,"People report hearing footsteps, and doors sla...",Elks Lodge,Michigan,MI,-85.841599,42.520552,-85.855303,42.529199,2008-03-12 00:00:00,11:47,True,False
9,Allegan,United States,Various ghostly activities. News coverage abou...,The Grill House and the Rock Bottom Bar,Michigan,MI,-85.857564,42.497762,-85.855303,42.529199,2008-04-26 00:00:00,13:54,True,False


*4c.) Adding the "Haunted Places Date" feature to the dataset.*

In [None]:
# WARNING: This code takes an hour and a half to run fully
import pandas as pd
import wikipedia
import datefinder
import time
import random
import re
from datetime import datetime
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


# Initialize columns
df['wiki_date_found'] = False
df['extracted_date'] = pd.NaT


# Extract years from descriptions early to reduce Wikipedia searches
def extract_years(description):
    if pd.isna(description):
        return []
    return re.findall(r'\b(1[5-9][0-9]{2}|20[0-2][0-9])\b', description)


# Function to get dates from Wikipedia
def get_date_from_wikipedia(row):
    location, city, state, description = row['location'], row['city'], row['state'], row.get('description', '')


    queries = [f"{location} {city} {state} haunted", f"{location} {city} haunted", f"{location} haunted"]
    years = extract_years(description)


    for year in years:
        queries.insert(0, f"{location} {city} {state} {year}")


    for query in queries:
        try:
            search_results = wikipedia.search(query, results=2)
            for result in search_results:
                try:
                    page = wikipedia.page(result, auto_suggest=False)
                    matches = list(datefinder.find_dates(page.content, source=True))
                    valid_matches = [date for date, _ in matches if 1500 <= date.year <= datetime.now().year]
                    if valid_matches:
                        return min(valid_matches, key=lambda d: d.year).strftime('%Y-%m-%d')
                except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
                    continue
        except Exception:
            time.sleep(2)
            continue
    return None


# Process data in parallel
sample_size = len(df)
sample_indices = random.sample(range(len(df)), sample_size)
sample_rows = df.iloc[sample_indices]


with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(executor.map(get_date_from_wikipedia, [row for _, row in sample_rows.iterrows()]), total=len(sample_rows), desc="Fetching Wikipedia data"))


# Store results and calculate success rate
processed_count = 0
date_found_count = 0
for i, idx in enumerate(sample_indices):
    if results[i]:
        df.at[idx, 'extracted_date'] = results[i]
        df.at[idx, 'wiki_date_found'] = True
        date_found_count += 1
    processed_count += 1


success_rate = (date_found_count / processed_count) * 100 if processed_count > 0 else 0
print(f"\nDate extraction results:")
print(f"- Processed entries: {processed_count}")
print(f"- Dates found: {date_found_count}")
print(f"- Success rate: {success_rate:.2f}%")


default_date = "2025-01-01"
df['Haunted Places Date'] = df['extracted_date'].fillna(default_date)
# df.to_csv('haunted_places_with_dates.csv', index=False)

import datetime
df = df.drop(columns=['wiki_date_found', 'extracted_date'])
df['Haunted Places Date'] = df['Haunted Places Date'].apply(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d') if '/' in x else x)
date_formats = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d']


# Function to try multiple formats
def parse_date(date_str):
    for fmt in date_formats:
        try:
            return datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return None  # In case no format matches


# Convert the 'dates' column to datetime using the datetime module and multiple formats
df['Haunted Places Date'] = df['Haunted Places Date'].apply(parse_date)


Fetching Wikipedia data: 100%|██████████| 5/5 [00:02<00:00,  2.42it/s]
  df.at[idx, 'extracted_date'] = results[i]



Date extraction results:
- Processed entries: 5
- Dates found: 4
- Success rate: 80.00%


*4d.) Adding the "Haunted Places Witness Count" feature to the dataset.*

In [None]:
import re
from thefuzz import fuzz

# Converting number words to integers
number_words = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
    "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
    "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18,
    "nineteen": 19, "twenty": 20
}

# Our keywords to indicate witnesses in the descriptions
witness_keywords = [
    "witness", "witnesses", "saw", "see", "seen", "report", "reported",
    "claim", "claims", "claimed", "account", "accounts", "experience",
    "encounter", "observe", "observed", "notice", "noticed", "spot",
    "spotted", "testify", "testified", "witnessed", "reports",
    "experiences", "sightings", "testimony", "described", "stories"
]

def word_to_number(word):
    return number_words.get(word.lower(), None)

def contains_keywords(text, keywords):
    return any(keyword in text for keyword in keywords)

# Rerunning the main function
def extract_fuzzy_context_aware_witness_count(description):
    if not isinstance(description, str):
        return 0  # If not a valid string, return 0

    description_lower = description.lower()

    # Extracting numbers (both words and digits), ignoring years
    matches = re.findall(r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|\d{1,2})\s+\w+', description_lower)

    for match in matches:
        words = match.split()
        num_word = words[0]
        unit = " ".join(words[1:])  # Extracting unit following the number

        # Converting number word to integer if applicable
        num_value = int(num_word) if num_word.isdigit() else word_to_number(num_word)

        # Using fuzzy matching to check if the unit refers to people-related words
        people_units = ["witness", "witnesses", "person", "people", "individuals", "men", "women", "children", "kids", "onlookers", "bystanders", "observers"]
        for reference_unit in people_units:
            if fuzz.ratio(unit, reference_unit) > 80:  # High similarity threshold
                return num_value  # Returning the exact count if contextually relevant

    # If no numeric value was found but witness-related keywords exist, return 1
    if contains_keywords(description_lower, witness_keywords):
        return 1

# Applying the improved function to the dataset
df['Haunted Places Witness Count'] = df['description'].apply(extract_fuzzy_context_aware_witness_count)

# Calculating the total witness count
total_witness_count = df['Haunted Places Witness Count'].sum()
print(f"Total number of witnesses: {total_witness_count}")

# Calculating the witness ratio
total_rows = len(df)
witness_ratio = total_witness_count / total_rows
print(f"Proportion of Witness Evidence: {witness_ratio:.2%}")

df.head(50)

Total number of witnesses: 8808.0
Proportion of Witness Evidence: 80.13%


Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Haunted Places Date,Daylight Duration 2,Audio Evidence,Image/Video/Visual Evidence,Haunted Places Witness Count
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,1954-03-12 00:00:00,11:44,True,True,1.0
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,2006-07-07 00:00:00,15:06,True,True,1.0
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,1929-05-30 00:00:00,14:58,False,True,1.0
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,1826-06-18 00:00:00,15:14,True,True,1.0
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,1900-03-12 00:00:00,11:45,True,False,1.0
5,Albion,United States,A mysterious lady in white has been spotted in...,Riverside Cemetery,Michigan,MI,-84.753056,42.236814,-84.75303,42.243097,2025-03-01 00:00:00,11:16,False,False,1.0
6,Algoma Township,United States,On a winding dirt road next to the Rogue River...,Hell's Bridge,Michigan,MI,,,-85.62293,43.149293,1644-03-12 00:00:00,11:46,True,True,1.0
7,Algonac,United States,Morrow Road is a Haunted road in Algonac Michi...,Morrow Road,Michigan,MI,-82.57629,42.652997,-82.531018,42.618367,2017-07-12 00:00:00,15:04,True,True,1.0
8,Allegan,United States,"People report hearing footsteps, and doors sla...",Elks Lodge,Michigan,MI,-85.841599,42.520552,-85.855303,42.529199,2008-03-12 00:00:00,11:47,True,False,1.0
9,Allegan,United States,Various ghostly activities. News coverage abou...,The Grill House and the Rock Bottom Bar,Michigan,MI,-85.857564,42.497762,-85.855303,42.529199,2008-04-26 00:00:00,13:54,True,False,


*4e.) Adding "Time of Day" feature in the dataset.*

In [None]:
# Defining keywords for the new features
evening = ["evening", "night"]
morning = ["morning", "day"]
dusk = ["dusk", "twilight"]

# Function to check for keywords in description
def contains_time(description, evening, morning, dusk):
    if isinstance(description, str):
      if any(keyword.lower() in description.lower() for keyword in evening):
        return "Evening"

      elif any(keyword.lower() in description.lower() for keyword in morning):
        return "Morning"

      elif any(keyword.lower() in description.lower() for keyword in dusk):
        return "Dusk"
    return "Unknown"

# Adding new features based on keyword matches
df["Time of Day"] = df["description"].apply(lambda x: contains_time(x, evening, morning, dusk))

df.head(50)

Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Haunted Places Date,Daylight Duration 2,Audio Evidence,Image/Video/Visual Evidence,Haunted Places Witness Count,Time of Day
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,1954-03-12 00:00:00,11:44,True,True,1.0,Evening
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,2006-07-07 00:00:00,15:06,True,True,1.0,Morning
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,1929-05-30 00:00:00,14:58,False,True,1.0,Evening
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,1826-06-18 00:00:00,15:14,True,True,1.0,Evening
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,1900-03-12 00:00:00,11:45,True,False,1.0,Unknown
5,Albion,United States,A mysterious lady in white has been spotted in...,Riverside Cemetery,Michigan,MI,-84.753056,42.236814,-84.75303,42.243097,2025-03-01 00:00:00,11:16,False,False,1.0,Unknown
6,Algoma Township,United States,On a winding dirt road next to the Rogue River...,Hell's Bridge,Michigan,MI,,,-85.62293,43.149293,1644-03-12 00:00:00,11:46,True,True,1.0,Evening
7,Algonac,United States,Morrow Road is a Haunted road in Algonac Michi...,Morrow Road,Michigan,MI,-82.57629,42.652997,-82.531018,42.618367,2017-07-12 00:00:00,15:04,True,True,1.0,Evening
8,Allegan,United States,"People report hearing footsteps, and doors sla...",Elks Lodge,Michigan,MI,-85.841599,42.520552,-85.855303,42.529199,2008-03-12 00:00:00,11:47,True,False,1.0,Unknown
9,Allegan,United States,Various ghostly activities. News coverage abou...,The Grill House and the Rock Bottom Bar,Michigan,MI,-85.857564,42.497762,-85.855303,42.529199,2008-04-26 00:00:00,13:54,True,False,,Unknown


*4f.) Adding "Apparition Type" feature to dataset*

In [None]:
import pandas as pd
from thefuzz import fuzz

# Defining apparition types
app_keywords = {
    "Ghost": ["ghost", "spirit", "phantom", "shadow person", "revenant", "poltergeist", "doppelgänger", "spectral figure", "floating figure", "dark shadow", "glowing silhouette", "undead presence", "ghostly mist", "invisible entity", "ethereal presence", "demonic form", "wraith", "mystical vision", "vanishing figure", "otherworldly being", "translucent form", "disembodied figure", "whispering spirit", "paranormal shadow", "mysterious manifestation", "haunted specter", "phantasmic figure", "ghostly encounter", "unnatural shadow", "mystic entity", "half-seen figure", "dark silhouette", "shimmering ghost", "cold spot", "ectoplasm", "haunted", "haunting", "possession", "exorcism", "poltergeist activity", "spiritual presence"],
    "Orb": ["orb", "glowing orb", "floating light", "mysterious ball of light", "luminescent sphere", "spirit orb", "energy orb", "ethereal light", "hovering sphere", "blue orb", "red orb", "green orb", "golden orb", "pulsating light", "translucent ball", "moving light", "ghostly orb", "spiritual light", "small floating orb", "orb phenomenon", "mystical glow", "supernatural orb", "halo effect", "orbs of energy", "shimmering sphere", "radiant ball", "sacred light", "mystic orb", "orb cluster", "orb manifestation"],
    "UAP": ["ufo", "uap", "flying saucer", "disc-shaped craft", "metallic object", "unidentified object", "alien craft", "spaceship", "extraterrestrial vessel", "unknown aircraft", "glowing disk", "cigar-shaped object", "bright lights in sky", "strange lights", "hovering object", "mysterious aircraft", "UFO sighting", "abduction", "alien encounter", "mysterious orb", "blinking lights", "beam of light", "warp speed", "high-speed object", "rotating object", "silent craft", "otherworldly vehicle", "flashing lights", "invisible ship", "sudden disappearance", "triangle formation", "military cover-up", "unexplained radar signal", "cosmic traveler", "interdimensional being", "light formation"]
}

# Defining keywords for classifying ghosts
male_keywords = ["man", "male", "gentleman", "bearded", "cowboy", "he", "him", "father", "brother", "son", "husband", "king", "prince", "monk", "warrior", "soldier", "gunman", "captain", "lord", "knight", "chief", "baron", "duke", "sheriff", "pilot", "miner", "hunter", "ranger", "swordsman", "officer", "sergeant", "commander", "priest", "sailor", "drifter", "gunslinger"]
female_keywords = ["woman", "witch", "mother", "sister", "lady", "she", "her", "daughter", "wife", "queen", "princess", "nun", "widow", "mistress", "maiden", "bride", "fair maiden", "baroness", "duchess", "damsel", "empress", "fortune teller", "sorceress", "seer", "gypsy", "mystic", "healer", "goddess", "handmaiden", "enchantress", "priestess", "noblewoman", "ghost bride", "widowed wife"]
child_keywords = ["girl", "boy", "brother", "sister", "young", "children", "child", "kid", "baby", "infant", "toddler", "schoolboy", "schoolgirl", "orphan", "lost boy", "lost girl", "ghostly child", "young spirit", "playful spirit", "little one", "innocent", "abandoned child", "cursed child", "nursery rhyme", "laughing child", "crying baby", "ghostly laughter", "mischievous child", "young soul", "phantom child", "small footsteps", "wandering child", "lost soul", "runaway child"]

# Defining additional app-related words
ghost_keywords = ["ghost", "spirit", "phantom", "shadow person", "revenant", "poltergeist", "doppelgänger", "spectral figure", "floating figure", "dark shadow", "glowing silhouette", "undead presence", "ghostly mist", "invisible entity", "ethereal presence", "demonic form", "wraith", "mystical vision", "vanishing figure", "otherworldly being", "translucent form", "disembodied figure", "whispering spirit", "paranormal shadow", "mysterious manifestation", "haunted specter", "phantasmic figure", "ghostly encounter", "unnatural shadow", "mystic entity", "half-seen figure", "dark silhouette", "shimmering ghost", "cold spot", "ectoplasm", "haunted", "haunting", "possession", "exorcism", "poltergeist activity", "spiritual presence"]
orb_keywords = ["orb", "glowing orb", "floating light", "mysterious ball of light", "luminescent sphere", "spirit orb", "energy orb", "ethereal light", "hovering sphere", "blue orb", "red orb", "green orb", "golden orb", "pulsating light", "translucent ball", "moving light", "ghostly orb", "spiritual light", "small floating orb", "orb phenomenon", "mystical glow", "supernatural orb", "halo effect", "orbs of energy", "shimmering sphere", "radiant ball", "sacred light", "mystic orb", "orb cluster", "orb manifestation"]
ufo_uap_keywords = ["ufo", "uap", "flying saucer", "disc-shaped craft", "metallic object", "unidentified object", "alien craft", "spaceship", "extraterrestrial vessel", "unknown aircraft", "glowing disk", "cigar-shaped object", "bright lights in sky", "strange lights", "hovering object", "mysterious aircraft", "UFO sighting", "abduction", "alien encounter", "mysterious orb", "blinking lights", "beam of light", "warp speed", "high-speed object", "rotating object", "silent craft", "otherworldly vehicle", "flashing lights", "invisible ship", "sudden disappearance", "triangle formation", "military cover-up", "unexplained radar signal", "cosmic traveler", "interdimensional being", "light formation"]


# Function to perform fuzzy matching
def fuzzy_match(description, keywords, threshold=80):
    if isinstance(description, str):
        for keyword in keywords:
            if fuzz.partial_ratio(keyword.lower(), description.lower()) >= threshold:
                return keyword  # Return the matched keyword
    return "Unknown"

# Function to detect apparition type using fuzzy matching
def contains_app(description):
    for app_category, keywords in app_keywords.items():
        matched_keyword = fuzzy_match(description, keywords)
        if matched_keyword != "Unknown":
            return app_category  # Returns the apparition type like "Ghost", "Orb", "UAP"
    return "Unknown"


# If Ghost is detected, classifying as Male, Female, or Child
def fuzzy_classify_ghost(description):
    if fuzzy_match(description, child_keywords) != "Unknown":
        if fuzzy_match(description, female_keywords) != "Unknown":
            return "Female Child"
        elif fuzzy_match(description, male_keywords) != "Unknown":
            return "Male Child"
        return "Child"
    elif fuzzy_match(description, male_keywords) != "Unknown":
        return "Male"
    elif fuzzy_match(description, female_keywords) != "Unknown":
        return "Female"
    return "Unknown"

# Applying apparition type classification using fuzzy matching
df["Apparition Type"] = df["description"].apply(contains_app)

# Applying ghost classification for those categorized as Ghosts
df["Ghost Category"] = df.apply(
    lambda x: fuzzy_classify_ghost(x["description"]) if x["Apparition Type"] == "Ghost" else "N/A", axis=1
)

# Counting occurrences of each apparition type
apparition_counts = df["Apparition Type"].value_counts()

# Counting occurrences of Ghost subcategories
ghost_category_counts = df[df["Apparition Type"] == "Ghost"]["Ghost Category"].value_counts()

# Calculating proportion of known apparition types
total_rows = len(df)
known_apparitions_count = total_rows - apparition_counts.get("Unknown", 0)
known_apparitions_ratio = known_apparitions_count / total_rows

# Printing results
print("Apparition Type Counts:")
print(apparition_counts)

print("\nGhost Category Counts:")
print(ghost_category_counts)

print(f"\nProportion of known apparition types (excluding 'Unknown'): {known_apparitions_ratio:.2%}")

# Displaying the first 50 rows
df.head(50)

Apparition Type Counts:
Apparition Type
Ghost      5256
Unknown    5016
UAP         406
Orb         314
Name: count, dtype: int64

Ghost Category Counts:
Ghost Category
Male            3016
Female Child    2015
Male Child       168
Unknown           35
Female            16
Child              6
Name: count, dtype: int64

Proportion of known apparition types (excluding 'Unknown'): 54.37%


Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Haunted Places Date,Daylight Duration 2,Audio Evidence,Image/Video/Visual Evidence,Haunted Places Witness Count,Time of Day,Apparition Type,Ghost Category
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,1954-03-12 00:00:00,11:44,True,True,1.0,Evening,Ghost,Male
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,2006-07-07 00:00:00,15:06,True,True,1.0,Morning,Ghost,Female Child
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,1929-05-30 00:00:00,14:58,False,True,1.0,Evening,Ghost,Female Child
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,1826-06-18 00:00:00,15:14,True,True,1.0,Evening,Unknown,
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,1900-03-12 00:00:00,11:45,True,False,1.0,Unknown,Ghost,Female Child
5,Albion,United States,A mysterious lady in white has been spotted in...,Riverside Cemetery,Michigan,MI,-84.753056,42.236814,-84.75303,42.243097,2025-03-01 00:00:00,11:16,False,False,1.0,Unknown,UAP,
6,Algoma Township,United States,On a winding dirt road next to the Rogue River...,Hell's Bridge,Michigan,MI,,,-85.62293,43.149293,1644-03-12 00:00:00,11:46,True,True,1.0,Evening,Unknown,
7,Algonac,United States,Morrow Road is a Haunted road in Algonac Michi...,Morrow Road,Michigan,MI,-82.57629,42.652997,-82.531018,42.618367,2017-07-12 00:00:00,15:04,True,True,1.0,Evening,Ghost,Female Child
8,Allegan,United States,"People report hearing footsteps, and doors sla...",Elks Lodge,Michigan,MI,-85.841599,42.520552,-85.855303,42.529199,2008-03-12 00:00:00,11:47,True,False,1.0,Unknown,Unknown,
9,Allegan,United States,Various ghostly activities. News coverage abou...,The Grill House and the Rock Bottom Bar,Michigan,MI,-85.857564,42.497762,-85.855303,42.529199,2008-04-26 00:00:00,13:54,True,False,,Unknown,Ghost,Male


*4g.) Adding "Event Type" feature to dataset*

In [None]:
import pandas as pd
from thefuzz import fuzz

# Defining expanded keywords for tragic events
event_keywords = {
    "Murder": ["murder", "killed", "homicide", "slain", "stabbed", "shot", "brutally murdered",
               "executed", "lynched", "assassinated", "gunned down", "beheaded", "massacre"],
    "Death": ["death", "died", "passed away", "fatal accident", "perished", "lost their life",
              "tragic end", "corpse", "found dead", "met their end"],
    "Suicide": ["suicide", "took their own life", "self-inflicted", "hung themselves",
                "jumped to death", "ended their life", "overdosed"],
    "Disappearance": ["missing", "vanished", "disappeared", "never found", "last seen", "mystery disappearance"],
    "Possession": ["possessed", "demonic possession", "evil spirit took over", "exorcism required",
                   "spiritual attack", "supernatural control"],
    "Haunting": ["haunted by", "cursed by", "tormented spirit", "revengeful ghost", "paranormal curse",
                 "unsettled spirit"]
}

# Function to classify event type using fuzzy matching
def classify_event_type(description, threshold=80):
    if isinstance(description, str):
        for event, keywords in event_keywords.items():
            for keyword in keywords:
                if fuzz.partial_ratio(keyword.lower(), description.lower()) >= threshold:
                    return event  # Return the matched event category
    return "Unknown"

# Applying event classification using fuzzy matching
df["Event Type"] = df["description"].apply(classify_event_type)

# Counting occurrences of each event type
event_counts = df["Event Type"].value_counts()

# Calculating proportion of known event types
total_rows = len(df)
known_events_count = total_rows - event_counts.get("Unknown", 0)
known_events_ratio = known_events_count / total_rows

# Printing results
print("Event Type Counts:")
print(event_counts)

print(f"\nProportion of known event types (excluding 'Unknown'): {known_events_ratio:.2%}")

# Displaying the first 50 rows
df.head(50)

Event Type Counts:
Event Type
Unknown          4578
Murder           2883
Death            1895
Haunting          813
Disappearance     526
Suicide           287
Possession         10
Name: count, dtype: int64

Proportion of known event types (excluding 'Unknown'): 58.35%


Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,Haunted Places Date,Daylight Duration 2,Audio Evidence,Image/Video/Visual Evidence,Haunted Places Witness Count,Time of Day,Apparition Type,Ghost Category,Event Type
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,1954-03-12 00:00:00,11:44,True,True,1.0,Evening,Ghost,Male,Murder
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,2006-07-07 00:00:00,15:06,True,True,1.0,Morning,Ghost,Female Child,Murder
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,1929-05-30 00:00:00,14:58,False,True,1.0,Evening,Ghost,Female Child,Murder
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,1826-06-18 00:00:00,15:14,True,True,1.0,Evening,Unknown,,Murder
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,1900-03-12 00:00:00,11:45,True,False,1.0,Unknown,Ghost,Female Child,Death
5,Albion,United States,A mysterious lady in white has been spotted in...,Riverside Cemetery,Michigan,MI,-84.753056,42.236814,-84.75303,42.243097,2025-03-01 00:00:00,11:16,False,False,1.0,Unknown,UAP,,Unknown
6,Algoma Township,United States,On a winding dirt road next to the Rogue River...,Hell's Bridge,Michigan,MI,,,-85.62293,43.149293,1644-03-12 00:00:00,11:46,True,True,1.0,Evening,Unknown,,Murder
7,Algonac,United States,Morrow Road is a Haunted road in Algonac Michi...,Morrow Road,Michigan,MI,-82.57629,42.652997,-82.531018,42.618367,2017-07-12 00:00:00,15:04,True,True,1.0,Evening,Ghost,Female Child,Murder
8,Allegan,United States,"People report hearing footsteps, and doors sla...",Elks Lodge,Michigan,MI,-85.841599,42.520552,-85.855303,42.529199,2008-03-12 00:00:00,11:47,True,False,1.0,Unknown,Unknown,,Unknown
9,Allegan,United States,Various ghostly activities. News coverage abou...,The Grill House and the Rock Bottom Bar,Michigan,MI,-85.857564,42.497762,-85.855303,42.529199,2008-04-26 00:00:00,13:54,True,False,,Unknown,Ghost,Male,Unknown


*4h.) Joining the "Alcohol Abuse by State" dataset*

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# URL of the webpage to scrape
url = "https://drugabusestatistics.org/alcohol-abuse-statistics/"

# Sending a GET request to the webpage
response = requests.get(url)
response.raise_for_status()  # Check for request errors

# Parsing the webpage content
soup = BeautifulSoup(response.text, 'html.parser')

# Defining list of unwanted headings (non-state sections)
unwanted_headings = {
    "Highlights", "Alcohol Abuse Statistics", "Alcohol Abuse & COVID-19",
    "Alcohol Abuse & Children", "Analysis: Emerging Trends in Alcohol Abuse",
    "Alcohol-Related Illness and Death", "Alcohol Deaths & Demographics",
    "Alcohol Abuse & Death by State"
}

# Initializing lists to store state names and values
states = []
percentages = []
per_capita = []
death_counts = []

# Regex patterns
percentage_pattern = r"(\d+\.?\d*)%"  # Extracts percentage values
number_pattern = r"(\d+\.?\d*)"  # Extracts the first number in a sentence
death_pattern = r"(\d{1,3}(?:,\d{3})*)"  # Extracts full numbers with commas (e.g., 2,208)

# Finding all state sections (usually inside <h2> or <h3> tags)
state_headers = soup.find_all(['h2', 'h3'])

for header in state_headers:
    state_name = header.text.strip()  # Extract the state name

    # Skipping unwanted non-state sections
    if state_name in unwanted_headings:
        continue

    # Extracting only the state name (if extra text exists)
    state_name = state_name.split("Alcohol")[0].strip()

    bullet_list = header.find_next("ul")  # Find the <ul> that follows the state name

    if bullet_list:
        bullets = bullet_list.find_all("li")  # Get all <li> elements inside <ul>

        # Extracting the first bullet (percentage of people binge drinking)
        percentage = None
        if len(bullets) > 0:
            match = re.search(percentage_pattern, bullets[0].text)
            if match:
                percentage = match.group(1) + "%"

        # Extracting the second bullet (per capita alcohol deaths)
        per_capita_value = None
        if len(bullets) > 1:
            match = re.search(number_pattern, bullets[1].text)  # Get the first number
            if match:
                per_capita_value = match.group(1)  # Extract the numeric part

        # Extracting the fourth bullet (annual deaths)
        death_count = None
        if len(bullets) >= 4:
            match = re.search(death_pattern, bullets[3].text)  # Fourth bullet
            if match:
                death_count = match.group(1).replace(",", "")  # Extract full number and remove commas

        if percentage or per_capita_value or death_count:
            states.append(state_name)
            percentages.append(percentage if percentage else "N/A")
            per_capita.append(per_capita_value if per_capita_value else "N/A")
            death_counts.append(death_count if death_count else "N/A")

# Creating a DataFrame
alc_df = pd.DataFrame({
    "state": states,
    "Over 18 binge drink at least once per month": percentages,
    "Alcohol deaths per capita": per_capita,
    "Annual deaths attributable to excessive alcohol use": death_counts
})

# Saving the DataFrame to a CSV file
# alc_df.to_csv("alcohol_binge_drinking_deaths_per_capita.csv", index=False)

print("Scraped data successfully saved to 'alcohol_binge_drinking_deaths_per_capita.csv'.")
df = pd.merge(df, alc_df, on='state', how='left')

Scraped data successfully saved to 'alcohol_binge_drinking_deaths_per_capita.csv'.


*4i.) Joining the "Amount of Daylight by State" dataset.*

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# URL of the timeanddate page
url = "https://www.timeanddate.com/astronomy/usa"


# Send a request to the webpage
response = requests.get(url)
response.raise_for_status()  # Check for errors


# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')


# Find the table containing the sunrise/sunset data
table = soup.find("table", class_="tb-sm")


# Initialize lists to store scraped data
locations = []
sunrise_times = []
sunset_times = []

def time_to_minutes(time_str):
    hours, minutes = map(int, time_str.split(':'))
    return hours * 60 + minutes


# Function to convert total minutes back to time string
def minutes_to_time(minutes):
    return f"{minutes // 60:02}:{minutes % 60:02}"


# Iterate over each row in the table (excluding the header row)
for row in table.find_all("tr")[1:]:
    cols = row.find_all("td")
    # Process data in groups of 3 (Location, Sunrise, Sunset)
    for i in range(0, len(cols), 3):
        if i + 2 < len(cols):  # Ensure enough columns exist
            location = cols[i].text.strip()
            sunrise = cols[i + 1].text.strip().replace("↑", "").replace("am", "").replace("pm", "").strip()
            sunset = cols[i + 2].text.strip().replace("↓", "").replace("am", "").replace("pm", "").strip()


            locations.append(location)
            sunrise_times.append(sunrise)
            sunset_times.append(sunset)


# Create a DataFrame
df_daylight = pd.DataFrame({
    "Location": locations,
    "Sunrise": sunrise_times,
    "Sunset": sunset_times
})


# Function to extract state abbreviation
def extract_state_abbr(address):
    match = re.search(r'\((\w{2})\)', address)
    return match.group(1) if match else None


# Apply function to create new column


df_daylight['state_abbrev'] = df_daylight['Location'].apply(extract_state_abbr)
df_daylight = df_daylight.dropna()
df_daylight = df_daylight.drop_duplicates(subset='state_abbrev')
df_daylight['Sunset'] = (df_daylight['Sunset'].map(time_to_minutes) + 12 * 60).map(minutes_to_time)
print(df_daylight)


# Convert sunrise and sunset to numeric values
df_daylight['Sunrise'] = df_daylight['Sunrise'].map(time_to_minutes)
df_daylight['Sunset'] = df_daylight['Sunset'].map(time_to_minutes)
# Calculate daylight duration
df_daylight['Daylight Duration'] = df_daylight['Sunset'] - df_daylight['Sunrise']
# Convert daylight duration back to time format
df_daylight['Sunrise'] = df_daylight['Sunrise'].map(minutes_to_time)
df_daylight['Sunset'] = df_daylight['Sunset'].map(minutes_to_time)
df_daylight['Daylight Duration'] = df_daylight['Daylight Duration'].map(minutes_to_time)

df_daylight = df_daylight.drop(columns='Location')
df = pd.merge(df, df_daylight, on='state_abbrev', how='left')
print(df.head())

               Location Sunrise Sunset state_abbrev
0             Adak (AK)    9:03  20:48           AK
1       Harrisburg (PA)    7:21  19:12           PA
3           Albany (NY)    7:09  18:59           NY
4         Hartford (CT)    7:05  18:55           CT
5          Phoenix (AZ)    6:40  18:34           AZ
6      Albuquerque (NM)    7:19  19:12           NM
7           Helena (MT)    7:44  19:31           MT
8           Pierre (SD)    7:56  19:45           SD
9             Ames (IA)    7:29  19:19           IA
10        Honolulu (HI)    6:41  18:40           HI
11        Portland (OR)    7:26  19:14           OR
13         Houston (TX)    7:33  19:28           TX
14         Raleigh (NC)    7:27  19:20           NC
15       Annapolis (MD)    7:19  19:11           MD
16    Indianapolis (IN)    7:58  19:49           IN
17        Richmond (VA)    7:23  19:15           VA
18         Atlanta (GA)    7:50  19:44           GA
19         Jackson (MS)    7:13  19:07           MS
20      Sacr

In [None]:
# Warning: this code takes about 35 minutes to run fully
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  # Import tqdm


# Use caching to store results for repeated locations
@lru_cache(maxsize=1000)
def get_daylight_data(year, latitude, longitude):
    """Fetch daylight duration data for a given latitude and longitude."""
    if year < 1700:
        year = 2025

    url = f'https://aa.usno.navy.mil/calculated/durdaydark?year={year}&task=0&lat={latitude}&lon={longitude}&label=Location&tz=8&tz_sign=-1&submit=Get+Data'


    try:
        with requests.Session() as session:
            response = session.get(url, timeout=10)  # Adding a timeout to avoid hanging requests
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {latitude}, {longitude}: {e}")
        return pd.DataFrame()  # Return empty DataFrame on error


    soup = BeautifulSoup(response.text, 'html.parser')
    table1 = soup.find('table')
    table = table1.find_next_sibling()


    # Extract headers
    headers = [th.text.strip() for th in table.find('tr').find_all('td')]


    # Extract table rows
    data = [[col.text.strip() for col in row.find_all('td')] for row in table.find_all('tr')[1:]]


    # Create DataFrame
    new_df = pd.DataFrame(data)


    return new_df


def get_daylight_duration(latitude, longitude, dt):
    """Retrieve daylight duration for a specific latitude, longitude, and date."""
    year = dt.year
    new_df = get_daylight_data(year, latitude, longitude)


    # Extract month & day
    col_index = dt.month
    row_index = dt.day


    if row_index >= len(new_df) or col_index >= len(new_df.columns):
        return "Invalid date or data unavailable."


    return new_df.iloc[row_index, col_index]


def calculate_daylight(index, row):
    """Compute daylight duration efficiently."""
    latitude = round(row["latitude"] if pd.notna(row["latitude"]) else (row["city_latitude"] if pd.notna(row["city_latitude"]) else 34.08), 2)
    longitude = round(row["longitude"] if pd.notna(row["longitude"]) else (row["city_longitude"] if pd.notna(row["city_longitude"]) else -118.37), 2)


    daylight_duration = get_daylight_duration(latitude, longitude, row["Haunted Places Date"])
    if index % 50 == 0:
        print(f"Processed row {index + 1}: Latitude = {latitude}, Longitude = {longitude}, Daylight Duration = {daylight_duration}")
    return daylight_duration


# Get total number of rows for progress tracking
total_rows = len(df)


# Create a tqdm progress bar
with tqdm(total=total_rows, desc="Processing rows", unit="row") as pbar:
    results = []

    # Parallel execution using ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=5) as executor:  # Reduced max_workers for stability
        # Submitting tasks to executor
        futures = {executor.submit(calculate_daylight, i, row): i for i, row in df.iterrows()}


        for future in as_completed(futures):
            index = futures[future]
            try:
                result = future.result()
                results.append((index, result))
            except Exception as e:
                print(f"Error processing row {index + 1}: {e}")
                results.append((index, "Error"))

            # Update progress bar after each completed task
            pbar.update(1)


# Sort results to maintain correct row order
results.sort(key=lambda x: x[0])
df["Daylight Duration 2"] = [res[1] for res in results]

print("Finished processing all rows!")


Processing rows:   0%|          | 1/10992 [00:02<6:11:12,  2.03s/row]

Processed row 1: Latitude = 42.96, Longitude = -85.5, Daylight Duration = 11:44


Processing rows:   0%|          | 30/10992 [00:07<47:34,  3.84row/s]


KeyboardInterrupt: 

**Task #5: Identifying 3 other datasets with different top level MIME types.**



* **Dataset #1:** Crime Data (text/CSV)
* **Dataset #2:** Census Data (application/xml)
* **Dataset #3:** Moon Phase Data (/)



*Dataset #1: Crime Dataset (text/CSV)*

In [None]:
# !pip install kagglehub
import kagglehub as kh
import os

# Downloading latest version
path = kh.dataset_download("mrayushagrawal/us-crime-dataset")
print("Path to dataset files:", path)

# Getting the dataset download path
path = kh.dataset_download("mrayushagrawal/us-crime-dataset")

# Listing files in the dataset directory
print("Files in dataset directory:", os.listdir(path))

# Updating with actual CSV filename found in the previous step
csv_file_path = os.path.join(path, "US_Crime_DataSet.csv")  # Adjust filename as needed

# Loading the CSV file
csv_df = pd.read_csv(csv_file_path)

# Showing first few rows
print(csv_df.head())

# Showing shape of data
print(csv_df.shape)

csv_selected = csv_df[["Crime Solved", "Victim Sex", "Weapon", "City"]]  # Replace with actual column names
csv_selected = csv_selected.rename(columns={
   "Crime Solved": "crime_solved",
   "Victim Sex": "victim_sex",
   "Weapon": "weapon",
   "City": "city"
})

# Showing selected data
print(csv_selected.head())

csv_selected = csv_selected.drop_duplicates(subset=["city"])
df = pd.merge(df, csv_selected, on="city", how="left")  # Changing "common_column"
df = df.drop(columns=["Unnamed: 0"])
print(df.head())

Path to dataset files: /root/.cache/kagglehub/datasets/mrayushagrawal/us-crime-dataset/versions/2
Files in dataset directory: ['US_Crime_DataSet.csv']


  csv_df = pd.read_csv(csv_file_path)


   Record ID Agency Code Agency Name       Agency Type       City   State  \
0          1     AK00101   Anchorage  Municipal Police  Anchorage  Alaska   
1          2     AK00101   Anchorage  Municipal Police  Anchorage  Alaska   
2          3     AK00101   Anchorage  Municipal Police  Anchorage  Alaska   
3          4     AK00101   Anchorage  Municipal Police  Anchorage  Alaska   
4          5     AK00101   Anchorage  Municipal Police  Anchorage  Alaska   

   Year    Month  Incident              Crime Type  ... Victim Ethnicity  \
0  1980  January         1  Murder or Manslaughter  ...          Unknown   
1  1980    March         1  Murder or Manslaughter  ...          Unknown   
2  1980    March         2  Murder or Manslaughter  ...          Unknown   
3  1980    April         1  Murder or Manslaughter  ...          Unknown   
4  1980    April         2  Murder or Manslaughter  ...          Unknown   

  Perpetrator Sex  Perpetrator Age               Perpetrator Race  \
0          

*Dataset #2: Census Data (application/xml)*

In [None]:
import requests
import pandas as pd

# API URL for city-level data (Population, Median Age, Housing Units)
url = "https://api.census.gov/data/2021/acs/acs5/profile?get=NAME,DP05_0018E,DP05_0001E,DP04_0002E&for=place:*"

response = requests.get(url)
data = response.json()

# Convert to DataFrame
columns = data[0]  # First row contains column names
rows = data[1:]    # Remaining rows contain data
df_census = pd.DataFrame(rows, columns=columns)

# Rename columns for clarity
df_census.rename(columns={
    "NAME": "city",
    "DP05_0001E": "Population",
    "DP05_0018E": "Median Age",
    "DP04_0002E": "Housing Units"
}, inplace=True)

df_census.head()

Unnamed: 0,city,Median Age,Population,Housing Units,state,place
0,"Abanda CDP, Alabama",18.9,231,67,1,100
1,"Abbeville city, Alabama",51.5,2231,922,1,124
2,"Adamsville city, Alabama",44.9,4381,1490,1,460
3,"Addison town, Alabama",37.6,697,296,1,484
4,"Akron town, Alabama",37.5,385,92,1,676


In [None]:
# Clean up census DataFrame
df_census["city"] = df_census["city"].str.split(", ").str[0]
df_census = df_census.drop(columns=["state", "place"])

In [None]:
import re

# Function to clean city names
def clean_city_name(city):
   return re.sub(r"\s+(city|town|village|borough|CDP)$", "", city, flags=re.IGNORECASE).strip()


# Apply to Census dataset
df_census["city"] = df_census["city"].apply(clean_city_name)

df_census["city"] = df_census["city"].str.lower().str.strip()
# new_df["city"] = new_df["city"].str.lower().str.strip()
df_census["city"] = df_census["city"].apply(lambda x: x.title())
df_census.head()
df_census = df_census.drop_duplicates(subset=["city"], keep="first")


In [None]:
# Merging datasets on "city"
df = pd.merge(df, df_census, on="city", how="left")
df.head()

Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,...,Annual deaths attributable to excessive alcohol use,Sunrise,Sunset,Daylight Duration,crime_solved,victim_sex,weapon,Median Age,Population,Housing Units
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,...,4548.0,07:53,19:42,11:49,No,Female,Knife,60.1,48,31
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,...,4548.0,07:53,19:42,11:49,No,Male,Unknown,37.6,697,296
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,...,4548.0,07:53,19:42,11:49,,,,49.2,687,268
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,...,4548.0,07:53,19:42,11:49,,,,49.2,687,268
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,...,4548.0,07:53,19:42,11:49,,,,21.4,274,71


*Dataset #3: Moon Phase Data*

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import time
import random


# ---------------------- Fetch Moon Data for Each Date ----------------------
MOON_API_BASE = "https://svs.gsfc.nasa.gov/api/dialamoon/"


# Default moon data for 2011-01-01
default_moon_data = {
    "phase": 12.35,  # Correct phase for 2011-01-01
    "diameter": 1889.5,  # Correct diameter for 2011-01-01 in km
    "distance": 379317.0,  # Correct distance for 2011-01-01 in km
}


# Cache to store moon data for already processed dates
moon_data_cache = {}


def get_moon_data(date, retries=3):
    """Fetch moon phase, diameter, and distance from NASA API with retries."""
    if pd.isnull(date):
        return "Unknown", "Unknown", "Unknown"


    # Check if the date is already in the cache
    if date in moon_data_cache:
        print(f"Using cached data for date {date}")
        return moon_data_cache[date]["phase"], moon_data_cache[date]["diameter"], moon_data_cache[date]["distance"]


    # Check if the date is before 2011-01-01
    if date < pd.Timestamp("2011-01-01"):
        print(f"Using default data for date {date}")
        # Cache the default data for the date
        moon_data_cache[date] = default_moon_data
        return default_moon_data["phase"], default_moon_data["diameter"], default_moon_data["distance"]


    formatted_date = date.strftime("%Y-%m-%dT%H:%M")
    url = f"{MOON_API_BASE}{formatted_date}"


    attempt = 0
    while attempt < retries:
        try:
            response = requests.get(url, timeout=20, verify=False)
            if response.status_code == 200:
                moon_data = response.json()
                # Cache the fetched data for the date
                moon_data_cache[date] = {
                    "phase": moon_data.get("phase", "Unknown"),
                    "diameter": moon_data.get("diameter", "Unknown"),
                    "distance": moon_data.get("distance", "Unknown")
                }
                return moon_data_cache[date]["phase"], moon_data_cache[date]["diameter"], moon_data_cache[date]["distance"]
            else:
                print(f"Error with status code {response.status_code} for date {formatted_date}")
                return "Error", "Error", "Error"
        except Exception as e:
            print(f"Error for date {formatted_date}: {e}")
            attempt += 1
            time.sleep(2 ** attempt + random.uniform(0, 1))  # Exponential backoff

    return "Error", "Error", "Error"


# Function to fetch moon data in parallel
def fetch_moon_data_parallel(dates):
    with ThreadPoolExecutor(max_workers=4) as executor:  # Reduce max_workers for fewer concurrent requests
        results = list(tqdm(executor.map(get_moon_data, dates), total=len(dates)))
    return results


# ---------------------- Parallel Data Fetching ----------------------
# Extract dates from your dataset
dates = pd.to_datetime(df["Haunted Places Date"], errors="coerce")


# Fetch moon data in parallel
moon_data_results = fetch_moon_data_parallel(dates)


# Convert the results into a DataFrame and join them with the original DataFrame
moon_data_df = pd.DataFrame(moon_data_results, columns=["moon_phase", "moon_diameter", "moon_distance"])
df = pd.concat([df, moon_data_df], axis=1)

# Display Updated Dataset
print(df.head())


Using default data for date 1954-03-12 00:00:00
Using default data for date 2006-07-07 00:00:00
Using default data for date 1929-05-30 00:00:00
Using default data for date 1826-06-18 00:00:00
Using default data for date 1900-03-12 00:00:00
Using default data for date 2008-03-12 00:00:00
Using default data for date 2008-04-26 00:00:00
Using default data for date 1999-03-12 00:00:00
Using default data for date 1901-03-12 00:00:00


  0%|          | 6/10992 [00:00<09:04, 20.17it/s]

Using default data for date 1830-03-12 00:00:00
Using default data for date 1910-03-12 00:00:00
Using default data for date 1875-03-12 00:00:00
Using default data for date 1819-03-12 00:00:00
Using default data for date 1913-03-12 00:00:00
Using default data for date 1904-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1914-03-12 00:00:00
Using default data for date 1802-09-20 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1954-03-12 00:00:00
Using default data for date 1974-09-10 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using default data for date 1960-03-12 00:00:00
Using default data for date 1807-03-12 00:00:00
Using default data for date 1777-03-12 00:00:00
Using default data for date 1998-03-12 00:00:00
Using default data for date 1827-03-12 00:00:00
Using default data for date 1882-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1981-03-12 00:00:

  0%|          | 32/10992 [00:00<03:44, 48.79it/s]

Using default data for date 2009-07-19 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1920-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1787-03-12 00:00:00
Using default data for date 1857-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using default data for date 1818-03-12 00:00:00
Using cached data for date 1818-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1974-03-12 00:00:00
Using default data for date 1912-03-12 00:00:00
Using default data for date 1965-03-12 00:00:00
Using default data for date 1891-03-12 00:00:00
Using default data for date 1692-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1960-08-31 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1818-03-12 00:00:00
Us

  2%|▏         | 237/10992 [00:01<00:32, 335.72it/s]

Using cached data for date 1900-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1996-03-12 00:00:00
Using cached data for date 1980-03-12 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 1877-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1822-03-12 00:00:00
Using cached data for date 2006-07-07 00:00:00
Using cached data for date 1911-03-12 00:00:00
Using cached data for date 1911-03-12 00:00:00
Using default data for date 1824-03-12 00:00:00
Using default data for date 1784-03-12 00:00:00
Using cached data for date 1958-03-12 00:00:00
Using default data for date 1790-03-12 00:00:00
Using cached data for date 1870-03-12 00:00:00
Using cached data for date 1824-03-12 00:00:00
Using default data for date 1975-03-12 00:00:00
Using d

  3%|▎         | 310/10992 [00:01<00:33, 320.95it/s]

Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1993-03-12 00:00:00
Using default data for date 1833-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1999-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1902-03-12 00:00:00
Using cached data for date 1818-03-12 00:00:00
Using default data for date 1937-03-12 00:00:00
Using default data for date 1785-03-12 00:00:00
Using cached data for date 1911-03-12 00:00:00
Using cached data for date 1844-03-12 00:00:00
Using cached data for date 1844-03-12 00:00:00
Using cached data for date 1789-03-12 00:00:00
Using default data for date 1942-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1870-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using de

  4%|▍         | 490/10992 [00:01<00:26, 390.87it/s]

Using default data for date 1978-07-11 00:00:00
Using cached data for date 1797-03-12 00:00:00
Using cached data for date 1986-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1864-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1870-03-12 00:00:00
Using cached data for date 1797-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1935-03-12 00:00:00
Using default data for date 1808-03-12 00:00:00
Using cached data for date 1832-03-12 00:00:00
Using default data for date 1927-03-12 00:00:00
Using cached data for date 1773-02-09 00:00:00
Using default data for date 1894-04-14 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1961-03-12 00:00:00
Using cached data for date 1830-03-12 00:00:00
Using cached data for date 1886-03-12 00:00:00
Using default data for date 1801-03-12 00:00:00
Using ca

  5%|▌         | 554/10992 [00:02<00:37, 277.48it/s]

Using default data for date 1689-03-12 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1828-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1901-03-12 00:00:00
Using default data for date 1755-07-09 00:00:00
Using cached data for date 1875-03-12 00:00:00
Using cached data for date 1983-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1818-03-12 00:00:00
Using default data for date 1890-05-05 00:00:00
Using cached data for date 1908-03-12 00:00:00
Using default data for date 1896-03-12 00:00:00
Using cached data for date 1946-06-28 00:00:00
Using default data for date 1897-11-08 00:00:00
Using cached data for date 1818-03-12 00:00:00
Using default data for date 1800-01-22 00:00:00
Using default data for date 1753-03-12 00:00:00
Using cached data for date 1908-03-12 00:00:00
Using cached data for date 1908-03-12 00:00:00
Using



Using default data for date 1759-03-12 00:00:00
Using cached data for date 1683-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1900-03-12 00:00:00
Using default data for date 1716-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1942-03-17 00:00:00
Using default data for date 1919-10-12 00:00:00
Using cached data for date 2019-03-12 00:00:00
Using cached data for date 1998-03-12 00:00:00
Using cached data for date 1895-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1983-03-12 00:00:00
Using default data for date 1744-03-12 00:00:00
Using default data for date 1748-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1771-03-12 00:00:00
Using default data for date 1732-05-18 00:00:00
Using default data for date 1979-03-12 00:00:00
Using default data for date 1854-03-12 00:00:00
Using cached data for date 1981-03-12 00:00:00
Usi

  7%|▋         | 722/10992 [00:03<00:43, 235.97it/s]

Using cached data for date 1772-03-12 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 1929-05-30 00:00:00
Using cached data for date 1998-03-12 00:00:00
Using cached data for date 1856-11-01 00:00:00
Using cached data for date 1908-03-12 00:00:00
Using cached data for date 1856-11-01 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1758-03-12 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 1896-03-12 00:00:00
Using cached data for date 1886-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1791-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using default data for date 1937-05-21 00:00:00
Using default data for date 2001-09-01 00:00:00
Using cached data for date 1979-03-12 00:00:00
Using cached data for date 1828-03-12 00:00:00
Using defau

  9%|▊         | 939/10992 [00:03<00:25, 390.36it/s]

Using default data for date 1901-05-23 00:00:00
Using default data for date 1872-03-12 00:00:00
Using default data for date 1686-03-12 00:00:00
Using default data for date 1956-04-14 00:00:00
Using cached data for date 1946-06-28 00:00:00
Using cached data for date 1683-03-12 00:00:00
Using cached data for date 1683-03-12 00:00:00
Using cached data for date 1939-03-12 00:00:00
Using default data for date 1881-07-30 00:00:00
Using default data for date 2010-09-23 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1832-03-12 00:00:00
Using cached data for date 1901-03-12 00:00:00
Using cached data for date 1984-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1865-03-12 00:00:00
Using cached data for date 1862-03-12 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using cached data for date 2025-03-19 00:00:00
Using default data for date 1865-04-15 00:00:00
Using

 10%|▉         | 1063/10992 [00:03<00:26, 368.23it/s]

Using default data for date 1864-12-06 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1790-03-12 00:00:00
Using cached data for date 1790-03-12 00:00:00
Using default data for date 1849-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1895-03-12 00:00:00
Using cached data for date 1875-03-12 00:00:00
Using default data for date 1921-03-12 00:00:00
Using cached data for date 1875-03-12 00:00:00
Using cached data for date 1875-03-12 00:00:00
Using cached data for date 1875-03-12 00:00:00
Using cached data for date 1875-03-12 00:00:00
Using cached data for date 1895-03-12 00:00:00
Using cached data for date 2019-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1841-03-12 00:00:00
Using cached data for date 1983-03-12 00:00:00
Using cached data for date 1983-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cach

 12%|█▏        | 1345/10992 [00:04<00:16, 596.84it/s]

Using default data for date 2000-12-05 00:00:00
Using cached data for date 1932-03-12 00:00:00
Using cached data for date 1700-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using default data for date 1874-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1996-03-12 00:00:00
Using cached data for date 1977-03-12 00:00:00
Using cached data for date 1874-03-12 00:00:00
Using cached data for date 1894-03-12 00:00:00
Using default data for date 1995-03-12 00:00:00
Using cached data for date 1769-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using default data for date 1973-03-24 00:00:00
Using default data for date 1919-03-26 00:00:00
Using cached data for date 1736-03-12 00:00:00
Using cached data for date 1906-03-12 00:00:00
Using cached data for date 1906-03-12 00:00:00
Using cached data for date 1906-03-12 00:00:00
Using cached data for date 1906-03-12 00:00:00
Using ca

 13%|█▎        | 1474/10992 [00:04<00:16, 591.13it/s]

Using default data for date 1857-02-01 00:00:00
Using cached data for date 1850-03-12 00:00:00
Using cached data for date 1700-03-12 00:00:00
Using cached data for date 1847-03-12 00:00:00
Using default data for date 1934-11-12 00:00:00
Using default data for date 1904-09-01 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using default data for date 1794-03-12 00:00:00
Using default data for date 1886-04-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1911-03-12 00:00:00
Using default data for date 1797-10-07 00:00:00
Using cached data for date 1797-10-07 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1860-07-19 00:00:00
Using default data for date 1733-03-12 00:00:00
Using cached data for date 1733-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1893-03-12 00:00:00
Using default data for date 1740-09-12 00:00:00
Using default data for date 1954-01-20 00:00:00
Usin



Using cached data for date 1846-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1965-03-12 00:00:00
Using cached data for date 1871-03-12 00:00:00
Using cached data for date 1871-03-12 00:00:00
Using default data for date 1751-03-12 00:00:00
Using default data for date 1904-11-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using default data for date 1843-11-27 00:00:00
Using cached data for date 1846-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-03-17 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1749-03-12 00:00:00
Using cached data for date 1749-03-12 00:00:00
Using default data for date 1936-12-08 00:00:00
Using default data for date 1925-03-25 00:00:00
Using cached data for date 1679-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using ca

 15%|█▍        | 1643/10992 [00:04<00:21, 437.57it/s]

Using default data for date 2006-03-12 00:00:00
Using default data for date 1959-12-14 00:00:00
Using default data for date 1876-03-12 00:00:00
Using cached data for date 1858-03-12 00:00:00
Using cached data for date 1837-03-12 00:00:00
Using default data for date 1830-04-22 00:00:00
Using default data for date 1879-03-12 00:00:00
Using cached data for date 1966-03-12 00:00:00
Using cached data for date 1905-08-03 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1859-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1894-03-12 00:00:00
Using default data for date 1897-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1980-03-12 00:00:00
Using cached data for date 1820-03-12 00:00:00
Using default data for date 2009-06-17 00:00:00
Using cached data for date 1978-03-12 00:00:00
Using cached data for date 1978-03-12 00:00:00
Using

 17%|█▋        | 1875/10992 [00:05<00:14, 636.19it/s]

Using default data for date 2001-10-24 00:00:00
Using cached data for date 1864-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1954-03-12 00:00:00
Using cached data for date 1953-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1911-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1724-03-12 00:00:00
Using cached data for date 1953-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1823-10-05 00:00:00
Using cached data for date 1895-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1887-03-12 00:00:00
Using default data for date 1958-08-29 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1885-03-12 00:00:00
Using cached data for date 1938-03-12 00:00:00
Using cached data for date 1859-03-12 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using defa



Using default data for date 1970-07-01 00:00:00
Using cached data for date 1872-01-23 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using default data for date 2007-10-12 00:00:00
Using default data for date 1937-10-05 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using cached data for date 1951-11-20 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using cached data for date 1924-03-12 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 2025-03-19 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1970-12-14 00:00:00
Using cached data for date 1807-03-12 00:00:00
Using cached data for date 1837-03-12 00:00:00
Using cached data for date 1970-03-12 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1909-03-12 00:00:00
Using cac



Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1927-03-12 00:00:00
Using cached data for date 1859-03-12 00:00:00
Using default data for date 1776-07-04 00:00:00
Using default data for date 2003-11-17 00:00:00
Using cached data for date 1807-03-12 00:00:00
Using default data for date 2008-05-05 00:00:00
Using cached data for date 1969-03-12 00:00:00
Using cached data for date 1805-03-12 00:00:00
Using default data for date 1903-12-26 00:00:00
Using cached data for date 2009-07-19 00:00:00
Using cached data for date 2020-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1816-03-12 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1947-07-15 00:00:00
Using ca

 21%|██        | 2304/10992 [00:05<00:16, 517.35it/s]

Using cached data for date 2025-05-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1752-03-12 00:00:00
Using cached data for date 1752-03-12 00:00:00
Using cached data for date 1947-01-08 00:00:00
Using default data for date 1979-11-12 00:00:00
Using cached data for date 1904-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1950-03-12 00:00:00
Using cached data for date 1893-03-12 00:00:00
Using cached data for date 1893-03-12 00:00:00
Using cached data for date 1893-03-12 00:00:00
Using cached data for date 1976-07-15 00:00:00
Using cached data for date 1870-03-12 00:00:00
Using cached data for date 1857-03-12 00:00:00
Using cached data for date 1850-03-12 00:00:00
Using cached data for date 1862-03-12 00:00:00
Using default data for date 1941-03-14 00:00:00
Using cached data for date 2016-03-12 00:00:00
Using cached data for date 1863-03-12 00:00:00
Using cached data for date 1991-03-12 00:00:00
Using cache

 22%|██▏       | 2468/10992 [00:06<00:17, 487.47it/s]

Using default data for date 1882-09-15 00:00:00
Using cached data for date 2000-03-12 00:00:00
Using cached data for date 1872-03-12 00:00:00
Using cached data for date 1893-03-12 00:00:00
Using cached data for date 2000-03-12 00:00:00
Using cached data for date 1912-03-12 00:00:00
Using cached data for date 2011-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1926-03-12 00:00:00
Using default data for date 1981-05-02 00:00:00
Using cached data for date 1906-07-29 00:00:00
Using cached data for date 1914-03-12 00:00:00
Using cached data for date 1914-03-12 00:00:00
Using default data for date 1899-12-12 00:00:00
Using cached data for date 1680-03-12 00:00:00
Using cached data for date 1881-03-12 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using cached data for date 1736-03-12 00:00:00
Using default data for date 1999-07-26 00:00:00
Using cached data for date 1950-03-12 00:00:00
Using cached data for date 1951-03-12 00:00:00
Using cac

 25%|██▌       | 2796/10992 [00:06<00:12, 633.89it/s]

Using cached data for date 1911-03-12 00:00:00
Using cached data for date 1781-09-04 00:00:00
Using cached data for date 1790-12-05 00:00:00
Using cached data for date 1839-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1905-03-12 00:00:00
Using cached data for date 1905-03-12 00:00:00
Using cached data for date 1771-03-12 00:00:00
Using default data for date 1825-07-15 00:00:00
Using cached data for date 1870-03-12 00:00:00
Using cached data for date 1829-05-18 00:00:00
Using cached data for date 1951-03-12 00:00:00
Using cached data for date 1872-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1905-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using cached data for date 1962-11-01 00:00:00
Using cached

 29%|██▉       | 3231/10992 [00:07<00:09, 841.12it/s]

Using cached data for date 1785-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1907-03-12 00:00:00
Using cached data for date 1888-03-12 00:00:00
Using cached data for date 1884-03-12 00:00:00
Using cached data for date 1937-10-05 00:00:00
Using default data for date 1854-08-12 00:00:00
Using cached data for date 1839-03-12 00:00:00
Using cached data for date 1839-03-12 00:00:00
Using cached data for date 1819-03-12 00:00:00
Using cached data for date 1905-03-12 00:00:00
Using cached data for date 1857-03-12 00:00:00
Using cached data for date 1872-01-23 00:00:00
Using cached data for date 1934-11-12 00:00:00
Using cached data for date 1948-03-12 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 2019-03-12 00:00:00
Using cached

 32%|███▏      | 3543/10992 [00:07<00:08, 862.44it/s]

Using default data for date 1718-03-12 00:00:00
Using cached data for date 1754-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1716-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1771-03-12 00:00:00
Using cached data for date 1980-03-12 00:00:00
Using default data for date 1857-02-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1998-03-12 00:00:00
Using default data for date 1985-09-29 00:00:00
Using cached data for date 2007-10-12 00:00:00
Using cached data for date 2012-10-12 00:00:00
Using cached data for date 1892-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1718-03-12 00:00:00
Using cached data for date 1875-03-12 00:00:00
Using cached data for date 1951-03-12 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using defa

 35%|███▍      | 3794/10992 [00:07<00:09, 759.33it/s]

Using default data for date 1952-01-05 00:00:00
Using cached data for date 1952-01-05 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1835-03-12 00:00:00
Using cached data for date 1974-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1854-03-12 00:00:00
Using cached data for date 1892-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1981-03-12 00:00:00
Using cached data for date 1819-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1819-12-13 00:00:00
Using cached data for date 1791-12-15 00:00:00
Using cached data for date 1872-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1953-03-12 00:00:00
Using cached data for date 1807-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cache

 36%|███▋      | 3990/10992 [00:08<00:13, 519.91it/s]

Using cached data for date 1855-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 1937-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 1772-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1872-01-23 00:00:00
Using cached data for date 1979-03-12 00:00:00
Using default data for date 1775-11-21 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 2003-06-02 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 1852-03-12 00:00:00
Using cached data for date 1736-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 1885-03-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1858-03-12 00:00:00
Using default data for date 1857-11-21 00:00:00
Using cach

 45%|████▌     | 4965/10992 [00:08<00:05, 1078.75it/s]

Using cached data for date 1796-03-12 00:00:00
Using cached data for date 1980-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 2025-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 1841-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1734-03-12 00:00:00
Using cached data for date 1942-01-17 00:00:00
Using cached data for date 1796-03-12 00:00:00
Using cached data for date 1778-03-12 00:00:00
Using cached data for date 1797-03-12 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using cached data for date 1798-03-12 00:00:00
Using cached data for date 1797-03-12 00:00:00
Using cached data for date 1791-03-12 00:00:00
Using cached data for date 1830-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1958-05-12 00:00:00
Using cached

 46%|████▌     | 5068/10992 [00:09<00:06, 869.71it/s] 

Using cached data for date 1978-03-12 00:00:00
Using cached data for date 1736-03-12 00:00:00
Using cached data for date 1773-02-09 00:00:00
Using cached data for date 2002-03-12 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1911-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using cached data for date 2017-02-12 00:00:00
Using cached data for date 1897-03-12 00:00:00
Using default data for date 1915-12-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2006-03-12 00:00:00
Using cached data for date 1896-08-12 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1736-03-12 00:00:00
Using cached data for date 1985-03-12 00:00:00
Using cached

 49%|████▉     | 5363/10992 [00:09<00:06, 884.38it/s]

Using cached data for date 2025-01-01 00:00:00
Using default data for date 1914-03-13 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1833-03-12 00:00:00
Using cached data for date 1935-03-12 00:00:00
Using default data for date 1910-11-12 00:00:00
Using cached data for date 1919-03-12 00:00:00
Using cached data for date 1955-03-12 00:00:00
Using cached data for date 1943-03-12 00:00:00
Using cached data for date 1983-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1937-03-12 00:00:00
Using cached data for date 1900-03-12 00:00:00
Using default data for date 1932-04-04 00:00:00
Using cached data for date 2021-11-12 00:00:00
Using cached data for date 1781-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using default data for date 1914-05-13 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1985-03-12 00:00:00
Using cached data for date 1980-03-12 00:00:00
Using cac

 51%|█████     | 5627/10992 [00:09<00:06, 797.53it/s]

Using cached data for date 1901-03-12 00:00:00
Using cached data for date 1805-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1948-03-12 00:00:00
Using cached data for date 1852-03-12 00:00:00
Using cached data for date 1801-03-12 00:00:00
Using cached data for date 1836-03-12 00:00:00
Using cached data for date 2005-11-12 00:00:00
Using default data for date 1869-09-06 00:00:00
Using cached data for date 1832-03-12 00:00:00
Using cached data for date 1900-03-12 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using default data for date 1869-07-15 00:00:00
Using default data for date 1956-06-10 00:00:00
Using cached data for date 2005-11-12 00:00:00
Using default data for date 2010-03-04 00:00:00
Using cached data for date 1914-03-12 00:00:00
Using cached data for date 1678-03-12 00:00:00
Using def

 52%|█████▏    | 5745/10992 [00:10<00:06, 749.64it/s]

Using cached data for date 2012-10-12 00:00:00Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1911-03-12 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 1818-03-12 00:00:00
Using cached data for date 1774-03-12 00:00:00
Using cached data for date 1858-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1851-03-12 00:00:00
Using cached data for date 1754-03-12 00:00:00
Using cached data for date 1870-03-12 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 1904-03-12 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1990-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached d

 62%|██████▏   | 6770/10992 [00:10<00:02, 1734.81it/s]

Using default data for date 1922-02-14 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 1885-03-12 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using default data for date 1912-06-18 00:00:00
Using cached data for date 1925-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using default data for date 1860-12-18 00:00:00
Using cached data for date 2025-03-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1763-03-12 00:00:00
Using cached data for date 1828-03-12 00:00:00
Using cached data for date 1913-05-13 00:00:00
Using cached data for date 1898-03-12 00:00:00
Using cached data for date 1774-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1920-03-12 00:00:00
Using cached data for date 2010-08-07 00:00:00
Using cached data for date 1788-03-12 00:00:00
Using cached data for date 1996-03-12 00:00:00
Using cach

 71%|███████   | 7759/10992 [00:10<00:01, 2189.17it/s]

Using cached data for date 1797-03-12 00:00:00
Using cached data for date 1966-03-12 00:00:00
Using cached data for date 1752-03-12 00:00:00
Using cached data for date 1933-11-12 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using cached data for date 1971-07-22 00:00:00
Using cached data for date 1736-03-12 00:00:00
Using cached data for date 1700-03-12 00:00:00
Using cached data for date 1781-08-14 00:00:00
Using cached data for date 1758-03-12 00:00:00
Using cached data for date 2004-03-12 00:00:00
Using cached data for date 1700-03-15 00:00:00
Using cached data for date 1964-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using default data for date 1815-02-19 00:00:00
Using cached data for date 1860-03-12 00:00:00
Using cached data for date 1968-03-12 00:00:00
Using cached data for date 1908-03-12 00:00:00
Using default data for date 1862-12-17 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cache

 75%|███████▍  | 8209/10992 [00:11<00:01, 1784.81it/s]

Using cached data for date 1812-03-12 00:00:00
Using cached data for date 1906-03-12 00:00:00
Using cached data for date 1764-03-12 00:00:00
Using default data for date 1818-12-15 00:00:00
Using cached data for date 1920-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1896-03-12 00:00:00
Using cached data for date 1965-09-15 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1752-03-12 00:00:00
Using cached data for date 1896-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1897-11-08 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 2025-03-06 00:00:00
Using cached data for date 1815-03-12 00:00:00
Using cached data for date 1818-12-15 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 1847-03-12 00:00:00
Using default data for date 1981-05-09 00:00:00
Using cached data for date 1900-03-12 00:00:00
Using cache

 76%|███████▋  | 8389/10992 [00:11<00:01, 1535.06it/s]

Using cached data for date 1776-03-12 00:00:00
Using cached data for date 1776-03-12 00:00:00
Using cached data for date 2025-03-12 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 1776-03-12 00:00:00
Using default data for date 1971-11-26 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1988-05-27 00:00:00
Using cached data for date 1801-03-12 00:00:00
Using cached data for date 2002-03-12 00:00:00
Using cached data for date 1975-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1856-03-12 00:00:00
Using cached data for date 1856-03-12 00:00:00
Using cached data for date 1721-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1857-03-12 00:00:00
Using cached data for date 1955-03-12 00:00:00
Using cached data for date 1841-05-24 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached

 82%|████████▏ | 8971/10992 [00:11<00:01, 1960.68it/s]

Using cached data for date 1864-03-12 00:00:00
Using cached data for date 2010-03-12 00:00:00
Using cached data for date 1882-03-12 00:00:00
Using default data for date 1926-10-11 00:00:00
Using cached data for date 1800-03-12 00:00:00
Using cached data for date 1959-03-12 00:00:00
Using default data for date 1918-10-23 00:00:00
Using cached data for date 1939-03-12 00:00:00
Using cached data for date 2025-03-19 00:00:00
Using cached data for date 2009-07-19 00:00:00
Using default data for date 1977-11-12 00:00:00
Using cached data for date 1896-03-12 00:00:00
Using cached data for date 2023-03-12 00:00:00
Using default data for date 1965-03-24 00:00:00
Using cached data for date 1968-03-12 00:00:00
Using cached data for date 1856-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1977-08-12 00:00:00
Using default data for date 1937-09-27 00:00:00
Using c

 85%|████████▍ | 9329/10992 [00:12<00:01, 1431.18it/s]

Using default data for date 1901-05-30 00:00:00
Using cached data for date 2025-03-19 00:00:00
Using cached data for date 1872-01-23 00:00:00
Using cached data for date 1725-03-12 00:00:00
Using cached data for date 2012-10-12 00:00:00
Using cached data for date 2019-03-12 00:00:00
Using default data for date 1990-11-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 1977-03-12 00:00:00
Using cached data for date 2020-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1996-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1854-03-12 00:00:00
Using default data for date 1900-08-22 00:00:00
Using cached data for date 1900-08-22 00:00:00
Using cached data for date 1900-08-22 00:00:00
Using cached data for date 1765-03-12 00:00:00
Using cached data for date 1799-03-12 00:00:00
Using cached data for date 1799-03-12 00:00:00
Using cach

 88%|████████▊ | 9639/10992 [00:12<00:01, 1125.39it/s]

Using cached data for date 1924-03-12 00:00:00
Using cached data for date 1780-03-12 00:00:00
Using cached data for date 1885-03-12 00:00:00
Using cached data for date 1854-03-12 00:00:00
Using cached data for date 1868-03-12 00:00:00
Using cached data for date 1868-03-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1889-03-12 00:00:00
Using cached data for date 1849-07-23 00:00:00
Using cached data for date 1812-03-12 00:00:00
Using cached data for date 1948-03-12 00:00:00
Using cached data for date 1849-07-23 00:00:00
Using default data for date 1876-08-12 00:00:00
Using cached data for date 1988-03-12 00:00:00
Using cached data for date 2025-03-01 00:00:00
Using cached data for date 1948-03-12 00:00:00
Using default data for date 1975-06-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1828-03-12 00:00:00
Using cached data for date 1872-01-23 00:00:00
Using cached data for date 1977-03-12 00:00:00
Using cache



Using default data for date 1684-12-03 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using default data for date 1897-11-12 00:00:00
Using cached data for date 1786-03-12 00:00:00
Using default data for date 1780-09-18 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1692-03-12 00:00:00
Using cached data for date 1983-03-12 00:00:00
Using cached data for date 1924-03-12 00:00:00
Using cached data for date 1753-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1771-03-12 00:00:00
Using cached data for date 1917-08-14 00:00:00
Using cached data for date 1750-03-12 00:00:00
Using cached data for date 1849-03-12 00:00:00
Using cached data for date 2008-03-12 00:00:00
Using cached data for date 1763-03-12 00:00:00
Using cached data for date 2003-03-12 00:00:00


 94%|█████████▎| 10301/10992 [00:12<00:00, 1188.21it/s]

Using cached data for date 2017-02-12 00:00:00
Using cached data for date 1757-03-12 00:00:00
Using cached data for date 1950-03-12 00:00:00
Using cached data for date 1796-03-12 00:00:00
Using cached data for date 1993-01-21 00:00:00
Using cached data for date 1968-03-12 00:00:00
Using cached data for date 1909-03-12 00:00:00
Using cached data for date 1924-03-12 00:00:00
Using cached data for date 1902-03-12 00:00:00
Using cached data for date 1910-03-12 00:00:00
Using cached data for date 1926-03-12 00:00:00
Using cached data for date 1842-03-12 00:00:00
Using cached data for date 1798-03-12 00:00:00
Using cached data for date 2017-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using default data for date 1895-10-04 00:00:00
Using cached data for date 1701-03-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 1924-03-12 00:00:00
Using cached data for date 1754-03-12 00:00:00
Using cached data for date 1898-03-12 00:00:00
Using cached

 95%|█████████▍| 10412/10992 [00:13<00:00, 799.03it/s] 

Using cached data for date 1692-03-12 00:00:00
Using default data for date 1950-01-01 00:00:00
Using cached data for date 1781-03-12 00:00:00
Using cached data for date 2002-03-12 00:00:00
Using default data for date 1958-09-12 00:00:00
Using cached data for date 1827-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1953-03-12 00:00:00
Using cached data for date 1785-03-12 00:00:00
Using cached data for date 1993-03-12 00:00:00
Using cached data for date 1862-03-12 00:00:00
Using cached data for date 1739-03-12 00:00:00
Using cached data for date 1708-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1895-03-12 00:00:00
Using cached data for date 2025-03-21 00:00:00
Using cached data for date 1680-03-12 00:00:00
Using cached data for date 1989-03-12 00:00:00
Using cached data for date 1944-03-12 00:00:00
Using cached data for date 2025-01-01 00:00:00
Using cached data for date 1998-03-12 00:00:00
Using cache

100%|██████████| 10992/10992 [00:13<00:00, 806.04it/s] 


                city        country  \
0                Ada  United States   
1            Addison  United States   
2             Adrian  United States   
3             Adrian  United States   
4             Albion  United States   
...              ...            ...   
10987    Westminster  United States   
10988    Westminster  United States   
10989    Wheat Ridge  United States   
10990    Wheat Ridge  United States   
10991  Woodland Park  United States   

                                             description  \
0      Ada witch - Sometimes you can see a misty blue...   
1      A little girl was killed suddenly while waitin...   
2      If you take Gorman Rd. west towards Sand Creek...   
3      In the 1970's, one room, room 211, in the old ...   
4      Kappa Delta Sorority - The Kappa Delta Sororit...   
...                                                  ...   
10987  at 12 midnight you can see a lady with two lit...   
10988  Is haunted by the victims of a murder that h

In [None]:
# Save Haunted Places DataFrame as CSV (for tika-similarity task)
df.to_csv('haunted_places_complete.csv')

NameError: name 'df' is not defined

**Task #6 & Task #7: Installing and running Tika-Similarity, and Combining all new JSONs with additional features into a single TSV file**

In [None]:
!pip install csvkit==0.9.2

Collecting csvkit==0.9.2
  Downloading csvkit-0.9.2.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting openpyxl<2.3,>=2.2.0 (from csvkit==0.9.2)
  Downloading openpyxl-2.2.6.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.0/109.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dateutil==2.2 (from csvkit==0.9.2)
  Downloading python-dateutil-2.2.tar.gz (259 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.1/259.1 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jdcal (from openpyxl<2.3,>=2.2.0->csvkit==0.9.2)
  Downloading jdcal-1.4.1-py2.py3-none-any.whl.metadata (5.0 kB)
Downloading jdcal-1.4.1-py2.py3-none-any.whl (9.5 kB)
Building w

In [None]:
# Convert expanded Haunted Places CSV into TSV file
!csvformat -T haunted_places_complete.csv > haunted_places.tsv

In [None]:
# Setup ETL lib
!apt-get install -y libmagic-dev
!git clone https://github.com/chrismattmann/etllib.git
%cd etllib
!python setup.py install

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  libmagic-dev
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 105 kB of archives.
After this operation, 389 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libmagic-dev amd64 1:5.41-3ubuntu0.1 [105 kB]
Fetched 105 kB in 1s (113 kB/s)
Selecting previously unselected package libmagic-dev:amd64.
(Reading database ... 124947 files and directories currently installed.)
Preparing to unpack .../libmagic-dev_1%3a5.41-3ubuntu0.1_amd64.deb ...
Unpacking libmagic-dev:amd64 (1:5.41-3ubuntu0.1) ...
Setting up libmagic-dev:amd64 (1:5.41-3ubuntu0.1) ...
Processing triggers for man-db (2.10.2-1) ...
Cloning into 'etllib'...
remote: Enumerating objects: 510, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: To

In [None]:
# Make sure ETL lib was installed correctly
import etllib
print("ETL lib installed successfully!")

ETL lib installed successfully!


In [None]:
# Convert Haunted Places TSV into JSON using ETL lib
# requires Haunted Places TSV, colheader.txt, and encoding.txt
!mkdir -p ./json/aggregate-json
!tsvtojson -t ./haunted_places.tsv -j ./json/aggregate-json/aggregate.json -c ./colheaders.txt -o hauntedplaces -e ./encoding.txt -s 0.8 -v

['utf-8']
['city', 'country', 'description', 'location', 'state', 'state_abbrev', 'longitude', 'latitude', 'city_longitude', 'city_latitude', 'Audio Evidence', 'Image/Video/Visual Evidence', 'Haunted Places Date', 'Haunted Places Witness Count', 'Time of Day', 'Apparition Type', 'Ghost Category', 'Event Type', 'Over 18 binge drink at least once per month', 'Alcohol deaths per capita', 'Annual deaths attributable to excessive alcohol use', 'Sunrise', 'Sunset', 'Daylight Duration', 'Daylight Duration 2', 'crime_solved', 'victim_sex', 'weapon', 'Median Age', 'Population', 'Housing Units', 'moon_phase', 'moon_diameter', 'moon_distance']
Deduping list of structs. Count: [10993]
After dedup. Count: [10993]
Near duplicates detection.
Filtered 0 near duplicates.
After near duplicates. Count: [10992]
Writing output file: [json/aggregate-json/aggregate.json]


In [None]:
# Convert 'Haunted Places Date' column of Haunted Places JSON into Solr date
# format ("%Y-%m-%dT%H:%M:%SZ")

import json
from datetime import datetime

# Load your JSON data
with open('etllib/json/aggregate-json/aggregate.json', 'r') as f:
    data = json.load(f)

# List of possible date formats to try
DATE_FORMATS = [
    "%Y-%m-%d %H:%M:%S",  # Format like "1891-03-12 00:00:00"
    "%m/%d/%Y %H:%M",      # Format like "03/12/1954 0:00"
    "%m/%d/%Y",            # Format like "03/12/1954"
    "%d/%m/%Y %H:%M",      # Format like "12/03/1954 0:00"
    "%d/%m/%Y",            # Format like "12/03/1954"
    "%Y-%m-%d",            # Format like "1954-03-12"
]

# Function to convert date to ISO 8601 format with multiple date formats
def convert_date(record):
    if "Haunted Places Date" in record:
        original_date = record["Haunted Places Date"]

        # Check if the date is the string "Haunted Places Date" and set to Jan 1, 2025, Midnight
        if original_date == "Haunted Places Date":
            record["Haunted Places Date"] = "2025-01-01T00:00:00Z"
            return record  # Skip further conversion

        # Check if the date is already in the correct format
        try:
            # Try parsing with ISO 8601 format
            date_obj = datetime.strptime(original_date, "%Y-%m-%dT%H:%M:%SZ")
            return record  # Skip conversion if it's already in the correct format
        except ValueError:
            pass  # Continue with the conversion if it's not in the correct format

        # Try different date formats from the list
        for fmt in DATE_FORMATS:
            try:
                # Try parsing the date with each format
                date_obj = datetime.strptime(original_date, fmt)
                # Convert to ISO 8601 format (YYYY-MM-DDThh:mm:ssZ)
                record["Haunted Places Date"] = date_obj.strftime("%Y-%m-%dT%H:%M:%SZ")
                break  # Exit the loop if the date was successfully parsed
            except ValueError:
                continue  # Try the next date format if parsing fails

        # If no format succeeded, mark it as "Invalid Date"
        if isinstance(record["Haunted Places Date"], str) and not record["Haunted Places Date"].endswith('Z'):
            record["Haunted Places Date"] = "Invalid Date"
            print(f"Error converting date for record {record.get('id', 'Unknown')}: {original_date}")

    return record

# Apply the date conversion to all records in the "hauntedplaces" list
data["hauntedplaces"] = [convert_date(record) for record in data["hauntedplaces"]]

# Save the updated JSON data back to the file
with open('etllib/json/aggregate-json/aggregate.json', 'w') as f:
    json.dump(data, f, indent=4)

# Verify the change by printing the first record's updated date
print(data["hauntedplaces"][0]["Haunted Places Date"])  # Print the first updated date


2002-03-12T00:00:00Z


In [None]:
# Create folder to store repackaged JSON files
%cd ..
!mkdir -p split-json
%cd ./split-json

/content
/content/split-json


In [None]:
# Use ETL lib to repackage JSON files
!repackage -j ./etllib/json/aggregate-json/aggregate.json -o hauntedplaces -v

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Writing json file: [/content/split-json/f98f3364-d28e-4b15-8469-e1ca133cbfec.json]
Writing json file: [/content/split-json/154a6e7d-e04b-4658-9010-554d4a21456b.json]
Writing json file: [/content/split-json/70ed6898-fe40-4433-9dbf-bcfb3ab09157.json]
Writing json file: [/content/split-json/ccb58240-2b03-4ff1-97ae-6e7d1eb40c8e.json]
Writing json file: [/content/split-json/810ea86e-240b-4af5-a6a0-d2a6f5f8b147.json]
Writing json file: [/content/split-json/7590bccb-b6e7-4b89-ab14-93a58bdaf734.json]
Writing json file: [/content/split-json/3143baee-69f9-4cc1-a5cd-66eb5735cb66.json]
Writing json file: [/content/split-json/5d38823d-3a58-41d9-9b59-5ecd2ccf0b28.json]
Writing json file: [/content/split-json/e72b2576-a5df-4a25-b632-ef7adebaab05.json]
Writing json file: [/content/split-json/b41f1db1-7962-41ca-9084-001bf6431d46.json]
Writing json file: [/content/split-json/b1d8c5d6-4b9d-46ab-b17a-8e10d746bf6a.json]
Writing json file: [/c

In [None]:
# Set up tika-similarity
%cd ..
!pip install tika editdistance
!git clone https://github.com/chrismattmann/tika-similarity.git

/content
[33mDEPRECATION: Loading egg at /usr/local/lib/python3.11/dist-packages/hirlite-0.3.1-py3.11-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.11/dist-packages/tika-2.6.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.11/dist-packages/etllib-1.1-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.11/dist-packages/pytho

In [None]:
# Warning: This code splits the aggregate JSON file into folders with 100 files
# each. If you are trying to run all of the JSON files, do NOT run this code
import os
import shutil

def split_json_files(input_folder):
    # List all JSON files in the folder
    files = [f for f in os.listdir(input_folder) if f.endswith('.json')]

    # Split files into subfolders with a max of 100 files each
    folder_count = 0
    for i in range(0, len(files), 500):
        folder_count += 1
        new_folder = os.path.join(input_folder, f"folder_{folder_count}")
        os.makedirs(new_folder, exist_ok=True)

        # Move files to the new folder
        for file in files[i:i+500]:
            shutil.move(os.path.join(input_folder, file), os.path.join(new_folder, file))

# Example usage
input_folder = '/content/split-json'
split_json_files(input_folder)


In [None]:
# Create data folder to hold jaccard, editdistance, and cosine clustering files
!mkdir -p data
!mv ./split-json ./data
%cd ./data

/content/data


In [None]:
# Run jaccard_similarity pipeline
!python ./tika-similarity/tikasimilarity/distance/jaccard_similarity.py --inputDir ./data/split-json/folder_1 --outCSV jaccard.csv
!python ./tika-similarity/tikasimilarity/cluster/edit-cosine-circle-packing.py --inputCSV ./data/jaccard.csv --cluster 2
!python ./tika-similarity/tikasimilarity/cluster/edit-cosine-cluster.py --inputCSV ./data/jaccard.csv --cluster 2
!python ./tika-similarity/tikasimilarity/cluster/generateLevelCluster.py
!cp -R ./etllib/html/* .

Accepting all MIME Types.....
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/http/client.py", line 1395, in getresponse
    response.begin()
  File "/usr/lib/python3.11/http/client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/http/client.py", line 294, in _read_status
    raise RemoteDisconnected("Remote end closed connection without"
http.client.Rem

In [None]:
# Move jaccard.csv and JSON files into jaccard folder
%cd ./data
!mkdir jaccard
!mv *.json jaccard.csv jaccard

/content/data


In [None]:
# Run editdistance pipeline
%cd ./data
!python ./tika-similarity/tikasimilarity/distance/edit-value-similarity.py --inputDir ./data/split-json --outCSV edit.csv
!python ./tika-similarity/tikasimilarity/cluster/edit-cosine-circle-packing.py --inputCSV edit.csv --cluster 0
!python ./tika-similarity/tikasimilarity/cluster/edit-cosine-cluster.py --inputCSV edit.csv --cluster 2
!python ./tika-similarity/tikasimilarity/cluster/generateLevelCluster.py

/content/data
Accepting all MIME Types.....


In [None]:
# Move edit.csv and JSON files into editdistance folder
%cd ./data
!mkdir editdistance
!mv *.json edit.csv editdistance

/content/data


In [None]:
# Run cosine_similarity pipeline
%cd ./data
!python ./tika-similarity/tikasimilarity/distance/cosine_similarity.py --inputDir ./data/split-json --outCSV cosine.csv
!python ./tika-similarity/tikasimilarity/cluster/edit-cosine-circle-packing.py --inputCSV cosine.csv --cluster 2
!python ./tika-similarity/tikasimilarity/cluster/edit-cosine-cluster.py --inputCSV cosine.csv --cluster 2
!python ./tika-similarity/tikasimilarity/cluster/generateLevelCluster.py

Accepting all MIME Types.....


In [None]:
# Move cosine.csv and JSON files into cosine folder
%cd ./data
!mkdir cosine
!mv *.json cosine.csv cosine

/content/data
mkdir: cannot create directory ‘cosine’: File exists
