# UFO Sightings Analysis
# Web Scraping Using the Full Dataset
## Team Martianas (Angelica, Taryn, Tiffany)



In [1]:
import pandas as pd
import numpy as np
import requests as rs
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup# as soup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Scrape every page of the NUFORC "all reports" table into `headers` and `data`.
# Initialize Chrome webdriver
driver = webdriver.Chrome()
base_url = "https://nuforc.org/subndx/?id=all"

# Initialize lists to store headers and data
headers = []
data = []

# try/finally guarantees the browser is closed even if the scrape is
# interrupted or a wait times out; rows collected so far remain in `data`.
try:
    driver.get(base_url)
    wait = WebDriverWait(driver, 10)

    # Need to loop through all website pages
    while True:
        # Wait until at least one data cell is rendered before parsing, and keep
        # a handle on it so we can tell when the page has been replaced.
        first_cell = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table td"))
        )

        # Parse the current page HTML
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table')
        if not headers:  # Headers are the same on every page, so read them once
            headers = [th.text.strip() for th in table.find_all('th')]

        # Data rows (skip the header row); rows without any <td> are ignored
        for row in table.find_all('tr')[1:]:
            cells = [td.text.strip() for td in row.find_all('td')]
            if cells:
                data.append(cells)

        # Need to wait for the next button
        next_button = wait.until(
            EC.presence_of_element_located((By.ID, "table_1_next"))
        )

        # Stop once the next button is disabled (last page)
        if 'disabled' in (next_button.get_attribute('class') or ''):
            break

        # Click through JavaScript so an overlapping element cannot intercept
        # the click (the cause of ElementClickInterceptedException).
        driver.execute_script("arguments[0].click();", next_button)

        # Wait for the old rows to leave the DOM so the same page is not read twice.
        # NOTE(review): assumes the table re-renders its rows on paging - confirm.
        wait.until(EC.staleness_of(first_cell))
finally:
    # Close the webdriver
    driver.quit()

# Due to the amount of pages on the website, this cell will take some time to run.
# If it is interrupted, the rows scraped so far are still available in `data`.

KeyboardInterrupt: 

In [None]:
# Build the sightings table from the scraped rows, labelled with the scraped headers
df = pd.DataFrame(data=data, columns=headers)
df.head(5)

In [None]:
# Preview the last rows of the scraped table
df.tail()

In [None]:
# Total number of scraped rows
num_rows = df.shape[0]
num_rows

In [None]:
# Cleaning: 'Link', 'Media' and 'Posted' are not needed to answer the questions
df = pd.DataFrame(data=data, columns=headers).drop(
    ['Link', 'Media', 'Posted'],
    axis=1,
)
df.head(5)

In [None]:
# Inspect the column dtypes.
# 'Occurred' and 'Reported' will need to be converted to datetime.
datatypes = df.dtypes 
datatypes

In [None]:
# 'Reported' shows up as object rather than datetime. The frequency table
# below shows why: per the original note it holds dates, '' and 'Y'.
Reported = df.loc[:, 'Reported'].value_counts()
print(Reported)


In [None]:
# Frequency of each raw 'Occurred' value; some entries appear to be blank.
# (Original note: "length 192?")
Occurred = df.Occurred.value_counts()
print(Occurred)

In [None]:
# Number of sightings recorded for each country
country_list = df.Country.value_counts()
print(country_list)

In [None]:
# Number of sightings recorded for each state
state_list = df.State.value_counts()
print(state_list)

In [None]:
# Number of sightings recorded for each city
city_list = df.City.value_counts()
print(city_list)

In [None]:
# Number of sightings recorded for each reported shape
shape_list = df.Shape.value_counts()
print(shape_list)

In [None]:
# Number of rows and columns, as a (rows, columns) tuple
df.shape

In [None]:
# Summary of columns, non-null counts and dtypes
df.info()

In [None]:
# Split the 'Occurred' column into separate date and time columns.
# Split on the first space only and pick the pieces by position, so the
# assignment always produces exactly two columns. A blank value or one with
# extra spaces would otherwise cause a column-count mismatch.
occurred_parts = df['Occurred'].str.split(' ', n=1)
df['Date Occured'] = occurred_parts.str.get(0)
df['Time Occured'] = occurred_parts.str.get(1)  # missing when there is no time part
df

In [None]:
# Cleaned DataFrame: keep the split date/time columns and drop the raw
# 'Occurred' column along with anything else not listed here.
clean_df = df.loc[
    :,
    [
        'Date Occured',
        'Time Occured',
        'City',
        'State',
        'Country',
        'Shape',
        'Summary',
    ],
]

clean_df.head(5)

# USA Sightings vs Other Countries

In [None]:
# Count of distinct countries (the original run reported 9)
num_countries = clean_df.Country.nunique()
num_countries

In [None]:
# Sightings per country name (one of the names appears to be a blank field)
countries_count = clean_df.Country.value_counts()
countries_count

In [None]:
# Sightings per country, largest group first
country_activity = (
    clean_df
    .groupby('Country')
    .size()
    .sort_values(ascending=False)
)

# Bar chart of the per-country totals
fig, ax = plt.subplots(figsize=(10, 6))
country_activity.plot.bar(ax=ax, color='skyblue')
ax.set_title('Country Activities')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Activities')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

In [None]:
# Sightings per country, largest group first
country_activity = (
    clean_df
    .groupby('Country')
    .size()
    .sort_values(ascending=False)
)
country_activity



In [None]:
# Bar chart of the per-country totals
fig, ax = plt.subplots(figsize=(10, 6))
country_activity.plot.bar(ax=ax, color='skyblue')
ax.set_title('Country Activities')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Activities')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# USA States (Highest and Lowest Sightings)

In [None]:
# Keep only the rows whose country is 'USA'
usa_data = clean_df.loc[clean_df['Country'].eq('USA')]
usa_data

In [None]:
# Rows whose country is 'USA'
usa_data = clean_df[clean_df['Country'] == 'USA']

# Independent DataFrame for the USA rows. An explicit copy is taken because
# later cells assign new values to its columns; wrapping the filtered frame in
# pd.DataFrame() does not guarantee separate data.
usa_df = usa_data.copy()
usa_df

In [None]:
# Column dtypes of the full table `df`.
# NOTE(review): this inspects `df`, not `usa_df` - confirm which one was intended.
datatypes = df.dtypes 
datatypes

In [None]:
# Summary of columns, non-null counts and dtypes for the USA rows
usa_df.info()

In [None]:
# Convert the date strings to pandas datetimes in place, then show the dtypes.
# NOTE(review): no explicit format is given, so pandas infers it - confirm the
# source date layout if parsing looks wrong.
usa_df['Date Occured'] = pd.to_datetime(usa_df['Date Occured'])
print(usa_df.dtypes)

In [None]:
# Parse the time strings and keep only the time-of-day part, then show the dtypes.
usa_df['Time Occured'] = pd.to_datetime(usa_df['Time Occured']).dt.time
print(usa_df.dtypes)

In [None]:
# How often each time of day occurs among the USA sightings
time_values = usa_df.loc[:, 'Time Occured'].value_counts()
time_values

In [None]:
# Sightings per state, most frequent first
state_counts = usa_df.State.value_counts()
state_counts

## Top 5

In [None]:
# The five states with the most sightings
top_5_states = state_counts.iloc[:5]
top_5_states

In [None]:
# Bar chart: top 5 states by number of sightings
fig, ax = plt.subplots(figsize=(8, 6))
top_5_states.plot.bar(ax=ax, color='blue', alpha=0.7)
ax.set_title('Number of UFO Sightings - Top 5 States')
ax.set_xlabel('State')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Top 10

In [None]:
# The ten states with the most sightings
top_10_states = state_counts.iloc[:10]
top_10_states

In [None]:
# Bar chart: top 10 states by number of sightings
fig, ax = plt.subplots(figsize=(8, 6))
top_10_states.plot.bar(ax=ax, color='blue', alpha=0.7)
ax.set_title('Number of UFO Sightings - Top 10 States')
ax.set_xlabel('State')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Bottom 5

In [None]:
# The five states with the fewest sightings
bottom_5_states = state_counts.iloc[-5:]
bottom_5_states

In [None]:
# Bar chart: bottom 5 states by number of sightings
fig, ax = plt.subplots(figsize=(8, 6))
bottom_5_states.plot.bar(ax=ax, color='red', alpha=0.7)
ax.set_title('Number of UFO Sightings - Bottom 5 States')
ax.set_xlabel('State')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Bottom 10


In [None]:
# The ten states with the fewest sightings
bottom_10_states = state_counts.iloc[-10:]
bottom_10_states

In [None]:
# Bar chart: bottom 10 states by number of sightings
fig, ax = plt.subplots(figsize=(8, 6))
bottom_10_states.plot.bar(ax=ax, color='red', alpha=0.7)
ax.set_title('Number of UFO Sightings - Bottom 10 States')
ax.set_xlabel('State')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

##  Combined Top 5 and Bottom 5

In [None]:
# Combined: top 5 and bottom 5 states side by side on one chart.
# Each group is drawn with plt.bar using the state names as categories, so
# every state gets its own slot and its own tick label. Calling
# Series.plot(kind='bar') twice on the same axes stacks both groups in the
# same slots and leaves only the second group's labels on the axis.
plt.figure(figsize=(10, 6))
plt.bar(
    top_5_states.index.astype(str),
    top_5_states.values,
    width=0.5,
    color='blue',
    alpha=0.7,
    label='Top 5 States',
)
plt.bar(
    bottom_5_states.index.astype(str),
    bottom_5_states.values,
    width=0.5,
    color='red',
    alpha=0.7,
    label='Bottom 5 States',
)
plt.title('Number of UFO Sightings - Top and Bottom 5 States Comparison')
plt.xlabel('State')
plt.ylabel('Number of Sightings')
plt.legend()
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

##  Combined Top 10 and Bottom 10

In [None]:
# Combined: top 10 and bottom 10 states side by side on one chart.
# Each group is drawn with plt.bar using the state names as categories, so
# every state gets its own slot and its own tick label. Calling
# Series.plot(kind='bar') twice on the same axes stacks both groups in the
# same slots and leaves only the second group's labels on the axis.
plt.figure(figsize=(10, 6))
plt.bar(
    top_10_states.index.astype(str),
    top_10_states.values,
    width=0.5,
    color='blue',
    alpha=0.7,
    label='Top 10 States',
)
plt.bar(
    bottom_10_states.index.astype(str),
    bottom_10_states.values,
    width=0.5,
    color='red',
    alpha=0.7,
    label='Bottom 10 States',
)
plt.title('Number of UFO Sightings - Top and Bottom 10 States Comparison')
plt.xlabel('State')
plt.ylabel('Number of Sightings')
plt.legend()
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# USA - Shapes

In [None]:
# Sightings per reported shape within the USA rows
shape_counts = usa_df.Shape.value_counts()
shape_counts

In [None]:
# Bar chart: number of USA sightings for each reported shape
fig, ax = plt.subplots(figsize=(8, 6))
shape_counts.plot.bar(ax=ax, color='green', alpha=0.7)
ax.set_title('Shape Type Sightings')
ax.set_xlabel('Shape')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Arizona Reports

In [None]:
# Keep only the USA rows whose state is 'AZ'
az_data = usa_df.loc[usa_df['State'].eq('AZ')]
az_data

In [None]:
# Histogram of Arizona sighting dates.
# Rows without a date are dropped first: blank 'Occurred' entries have no
# valid date to place in a bin.
plt.figure(figsize=(10, 6))
plt.hist(az_data['Date Occured'].dropna(), bins=30, color='yellow', edgecolor='black')
plt.title('Number of UFO Sightings in Arizona (AZ)')
plt.xlabel('Date Occurred')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Arizona sightings per year, drawn as a bar chart.
# Only need rows where the 'State' = 'AZ'
az_data = usa_df[usa_df['State'] == 'AZ'].copy()  # Make a copy to avoid the warning

# Get the year from the 'Date Occured' column and make a new column 'Year'
az_data.loc[:, 'Year'] = az_data['Date Occured'].dt.year

# Rows with a missing date have no year. Drop them and use whole-number years
# so that the range() call below receives integers.
valid_years = az_data['Year'].dropna().astype(int)

# Need range of years (None when there is no dated Arizona row at all)
if valid_years.empty:
    min_year = max_year = None
else:
    min_year = int(valid_years.min())
    max_year = int(valid_years.max())

# number of sightings for each year
year_counts = valid_years.value_counts().sort_index()

# Plotting
plt.figure(figsize=(10, 6))
plt.bar(year_counts.index, year_counts, color='yellow', edgecolor='black')
plt.title('Number of UFO Sightings in Arizona (AZ) by Year')
plt.xlabel('Year')
plt.ylabel('Number of Sightings')

# x-ticks = all years
if min_year is not None:
    plt.xticks(range(min_year, max_year + 1), rotation=45)

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


## Cities/Phoenix

In [None]:
# Sightings per Arizona city
city_counts = az_data.City.value_counts()

# Order the cities from most to fewest sightings
sorted_cities = city_counts.sort_values(ascending=False)
print(sorted_cities)

In [None]:
# Bar chart: number of sightings for each Arizona city
fig, ax = plt.subplots(figsize=(12, 6))
sorted_cities.plot.bar(ax=ax, color='orange', edgecolor='black')
ax.set_title('Number of UFO Sightings in Arizona Cities')
ax.set_xlabel('City')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
## Phoenix
# This section is about Arizona, so the state is matched as well as the city
# name; otherwise cities named 'Phoenix' in other states would be included.
phx_data = usa_df[(usa_df['City'] == 'Phoenix') & (usa_df['State'] == 'AZ')]
phx_data

## Years

In [None]:
# # Ran all these years and yielded no results
# year_2019 = usa_df[usa_df['Date Occured'].dt.year == 2019]
# year_2020 = usa_df[usa_df['Date Occured'].dt.year == 2020]
# year_2021 = usa_df[usa_df['Date Occured'].dt.year == 2021]
# year_2022 = usa_df[usa_df['Date Occured'].dt.year == 2022]
# year_2023 = usa_df[usa_df['Date Occured'].dt.year == 2023]


In [None]:
# USA sightings whose date falls in 2024
year_2024 = usa_df.loc[usa_df['Date Occured'].dt.year.eq(2024)]
year_2024

In [None]:
# Number of 2024 sightings on each calendar date
daily_counts_2024 = (
    year_2024
    .groupby(by=year_2024['Date Occured'].dt.date)
    .size()
)
daily_counts_2024


In [None]:
# Line chart: sightings per day in 2024
fig, ax = plt.subplots(figsize=(10, 6))
daily_counts_2024.plot.line(ax=ax, marker='o', color='skyblue')

ax.set_title('Number of UFO Sightings in 2024')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()