# DSCI 614 - Text-Mining
## Project 2
### by Tony Nguyen

## Perform google search by providing query

In [1]:
# import the googlesearch libary
from googlesearch import search
import pandas as pd

# povide the info for the query
query = "Winter snowstorm"

# we need to perform advanced search to get a lsit of RearchResult
# that contian the following three items:
# - title
# - url
# - description

# Get 100 search results in English using advanced search
# googlesearch will send multiple requests to go through the page by pausing about 5 seconds between the requests
results = list(search(query, num_results=100, sleep_interval=5, lang="en",advanced=True))


# Print the cleaned results
# print(results)

## Convert SearchResult objects to a list of strings

We can efficiently perform text mining on strings instead of SearchResult.

In [2]:
results = [str(item) for item in results]

## Convert a list of strings to a data frame

Sometime, we may need to store information in a data frame instead of a list to build some machine learning models.

In [3]:
data =[]
for result in results:
    # Extract URL, title, and description using string manipulation
    url = result.split("url=")[1].split(",")[0]
    title = result.split("title=")[1].split(",")[0]
    description = result.split("description=")[1].split(")")[0]
    
    # Create a dictionary for each SearchResult
    result_dict = {
        'URL': url,
        'Title': title,
        'Description': description
    }
    
    # Append the dictionary to the data list
    data.append(result_dict)

# Create the DataFrame
df = pd.DataFrame(data)

# initial check
print("Before removing anything:")
for index, row in df.iterrows():
    print(f"Original URL: {row['URL']}") 
    print(f"Original Title: {row['Title']}") 
    print(f"Original Description: {row['Description']}") 

Before removing anything:
Original URL: https://www.weather.gov/safety/winter-snow
Original Title: Snow Storm Safety
Original Description: Blizzard: Sustained winds or frequent gusts of 35 mph or more with snow and blowing snow frequently reducing visibility to less than a quarter mile for 3 hours ...
Original URL: https://www.nssl.noaa.gov/education/svrwx101/winter/types/
Original Title: Severe Weather 101: Winter Weather Types
Original Description: A winter storm is a combination of heavy snow, blowing snow and/or dangerous wind chills. A winter storm is life-threatening. Blizzards are dangerous winter ...
Original URL: https://scied.ucar.edu/learning-zone/storms/winter-storms
Original Title: Winter Storms - UCAR Center for Science Education
Original Description: Snowstorms are one type of winter storm. Blizzards are snowstorms with high winds, and lake effect storms are snowstorms that form near the Great Lakes.
Original URL: https://en.wikipedia.org/wiki/Winter_storm
Original Title

## Remove the date and time in the rearch_result using a regular expression

In [4]:
import re
for index, row in df.iterrows():
    '''
        Remove time as the following format
        6 PM
        6 am
        6 p.m.
        6 a.m.
        6:00 PM
        6:00 AM
        12 am
        12 pm
        12:45 AM
        12:45 PM
        12:30 a.m.
        12:34 p.m.
    '''
    removed_time = re.sub(r'\d{1,2}:?\d{0,2}\s((a|A).?(m|M).?|((p|P).?(m|M)).?)', '', row["Description"])
    ''' Remove date as the following format
        Tue February 28th 2023
        May 15
        Dec 28 2022
        February 28, 2022
    '''
    removed_date = re.sub(r'(A-Za-z]*\s)?[A-Za-z]+.?\s\d{1,2}(th)?,?( \d{4})?', '', row["Description"])
    # update Description
    row["Description"] = removed_date 

## Remove the hyperlink URL in the search_result using regular expression

In [5]:
for index, row in df.iterrows():
    # remove URL
    removed_url = re.sub(r'https?://.*', '', row["URL"])
    # update url
    row["URL"] = removed_url


## Remove all words containing at most two characters such as "a", "an", "in", etc

In [6]:
for index, row in df.iterrows():
    
    # remove a string of length 1 or 2 characters from title
    removed_2_chars = re.sub(r'\b[a-zA-Z]{1,2}\b', '', row["Title"]) 
    # update row["Title"]
    row["Title"] = removed_2_chars
    # remove a string of length 1 or 2 characters from description
    removed_2_chars = re.sub(r'\b[a-zA-Z]{1,2}\b', '', row["Description"])
    # update row["Description"]
    row["Description"] = removed_2_chars

## Remove the following five stop words: "are", "but", "very", "since", "could"

In [7]:
for index, row in df.iterrows():
    # remove "are"
    pattern = "are"
    remove_ = re.sub(pattern, '', row["Title"])
    # update title
    row["Title"] = remove_
    remove_ = re.sub(pattern, '', row["Description"])
    # update description
    row["Description"] = remove_
    
    # remove "but"
    pattern = "but"
    remove_ = re.sub(pattern, '', row["Title"])
    # update title
    row["Title"] = remove_
    remove_ = re.sub(pattern, '', row["Description"])
    # update description
    row["Description"] = remove_
    
    # remove "very"
    pattern = "very"
    remove_ = re.sub(pattern, '', row["Title"])
    # update tile
    row["Title"] = remove_
    remove_ = re.sub(pattern, '', row["Description"])
    # update description
    row["Description"] = remove_
    
    # remove "since"
    pattern = "since"
    remove_ = re.sub(pattern, '', row["Title"])
    # update title
    row["Title"] = remove_
    remove_ = re.sub(pattern, '', row["Description"])
    # update desctiption
    row["Description"] = remove_
    
    # remove "could"
    pattern = "could"
    remove_ = re.sub(pattern, '', row["Title"])
    # update title
    row["Title"] = remove_
    remove_ = re.sub(pattern, '', row["Description"])
    # update description
    row["Description"] = remove_

## Remove all special characters, punctuation using regular expression

In [8]:
for index, row in df.iterrows():
    # remove not character and not space from row["Title"] and row["Description"]
    remove_ = re.sub(r'[^\w\s]', '', row["Title"])
    # update title
    row["Title"] = remove_
    remove_ = re.sub(r'[^\w\s]', '', row["Description"])
    # update description
    row["Description"] = remove_
    
# final check
print("After removing all above things:")
for index, row in df.iterrows():
    print(f"New URL: {row['URL']}") 
    print(f"New Title: {row['Title']}") 
    print(f"New Description: {row['Description']}") 

After removing all above things:
New URL: 
New Title: Snow Storm Safety
New Description: Blizzard Sustained winds  frequent gusts  mph  more with snow and blowing snow frequently reducing visibility  less than  quarter mile  hours 
New URL: 
New Title: Severe Weather 101 Winter Weather Types
New Description:  winter storm   combination  heavy snow blowing snow and dangerous wind chills  winter storm  lifethreatening Blizzards  dangerous winter 
New URL: 
New Title: Winter Storms  UCAR Center for Science Education
New Description: Snowstorms  one type  winter storm Blizzards  snowstorms with high winds and lake effect storms  snowstorms that form near the Great Lakes
New URL: 
New Title: Winter storm
New Description:  winter storm   event  which wind coincides with varieties  precipitation that only occur  freezing temperatures such  snow mixed snow and 
New URL: 
New Title: Winter Weather
New Description: Winter storms including blizzards can bring extreme cold freezing rain snow ice a