<h1 style="text-align: center; color: orange;"><b>TASK 2</b></h1>

<h3 style="text-align: left; color: yellow;"><b>Stock Ticker Extraction</b></h3>

In [12]:
import os
import re
import warnings

import pandas as pd
import spacy
from rapidfuzz import process
# NOTE(review): this silences every warning for the rest of the session;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# Directory holding the input CSVs. It defaults to the original location; set
# the REUTERS_DATA_DIR environment variable to run the notebook elsewhere
# without editing the code.
DATA_DIR = os.environ.get("REUTERS_DATA_DIR", r"C:\Users\SHAHBAZ\Desktop")

# Define file paths
file_path = os.path.join(DATA_DIR, "reuters_headlines.csv")
sp500_file = os.path.join(DATA_DIR, "sp500_companies.csv")

# Load datasets
raw_data = pd.read_csv(file_path)
sp500_data = pd.read_csv(sp500_file)

# Load spaCy model. The parser, lemmatizer and attribute_ruler components are
# excluded; downstream code only reads doc.ents.
nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "attribute_ruler"])

# Function to extract and clean company names
def extract_company_names(text):
    """
    Extract organization names from text using spaCy's named entity recognition.

    Parameters
    ----------
    text : str
        Text to scan. Non-string input (e.g. None or NaN) and blank strings
        yield ``None`` without running the model.

    Returns
    -------
    list of str or None
        The stripped text of every entity labelled ``ORG``, in the order spaCy
        reports them, with a trailing possessive suffix removed. ``None`` when
        no such entity is found.
    """
    # Guard against missing values so the model is never handed a non-string.
    if not isinstance(text, str) or not text.strip():
        return None

    doc = nlp(text)
    company_names = [
        # Remove a trailing possessive, whether it is typed with a straight
        # apostrophe or the typographic one (U+2019).
        re.sub(r"['\u2019]s$", "", ent.text.strip())
        for ent in doc.ents if ent.label_ == "ORG"
    ]
    return company_names if company_names else None

# Function to clean company names column
def clean_company_names(df, column="Company_Names"):
    """
    Reduce every entry of ``column`` to a single, trimmed company name.

    Entries that are lists or tuples contribute their first element directly,
    so names that contain a comma or an apostrophe (for example
    "Air Products and Chemicals, Inc" or "Moody's Corp") survive intact.
    Any other value is converted with ``str()`` and parsed the legacy way:
    brackets and single quotes are removed and the text before the first
    comma is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str, default "Company_Names"
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``column`` holds one stripped string per row.
        An empty list or tuple becomes the empty string.
    """
    def _first_name(value):
        # Real sequences: take the first name as-is instead of round-tripping
        # through the list's string representation.
        if isinstance(value, (list, tuple)):
            return str(value[0]).strip() if len(value) else ""
        # Anything else (including an already stringified list): legacy parse.
        flattened = re.sub(r"[\[\]']", "", str(value))
        return flattened.split(",")[0].strip()

    # Work on a copy so the caller's frame is left untouched.
    cleaned = df.copy()
    cleaned[column] = cleaned[column].map(_first_name)
    return cleaned

# Function to map company names to stock tickers
def map_to_stock_tickers(df, sp500, score_threshold=80):
    """
    Map company names to stock tickers using fuzzy matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Company_Names`` column holding one name per row.
        It is not modified.
    sp500 : pandas.DataFrame
        Reference table with ``Shortname``, ``Longname`` and ``Symbol`` columns.
    score_threshold : float, default 80
        A match is accepted only when its score is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Matched_Company`` and ``Stock_Ticker`` columns
        added, restricted to rows that received a ticker and keeping only the
        first row for each ticker.

    Notes
    -----
    NOTE(review): with ``extractOne``'s default scorer and a cutoff of 80,
    loose matches get through (the saved notebook output maps
    "Delta Air Lines" to "Air Products and Chemicals, Inc"). Raise
    ``score_threshold`` or vet ``Matched_Company`` before relying on the
    tickers.
    """
    # Build the name -> ticker lookup. Long names are added after short names,
    # so on a key collision the long-name row supplies the ticker. Missing
    # names are skipped so NaN can never become a lookup key.
    company_ticker_map = {}
    for name_column in ("Shortname", "Longname"):
        for name, symbol in zip(sp500[name_column], sp500["Symbol"]):
            if isinstance(name, str):
                company_ticker_map[name] = symbol

    choices = list(company_ticker_map)
    best_match_by_name = {}  # cache: each distinct name is scored only once

    def find_best_match(name):
        # Missing or blank names cannot be matched.
        if not isinstance(name, str) or not name.strip():
            return None
        if name not in best_match_by_name:
            result = process.extractOne(name, choices)
            # extractOne yields None when it has nothing to return (e.g. no
            # choices), so unpack only after checking.
            if result is not None and result[1] > score_threshold:
                best_match_by_name[name] = result[0]
            else:
                best_match_by_name[name] = None
        return best_match_by_name[name]

    # Work on a copy so the caller's frame does not gain columns as a side effect.
    matched = df.copy()
    matched["Matched_Company"] = matched["Company_Names"].map(find_best_match)
    matched["Stock_Ticker"] = matched["Matched_Company"].map(company_ticker_map)

    # Drop rows without valid tickers, then keep one row per ticker.
    matched = matched.dropna(subset=["Stock_Ticker"])
    return matched.drop_duplicates(subset=["Stock_Ticker"], keep="first")

# Main processing pipeline
def main_pipeline(raw_data, sp500_data):
    """
    Main pipeline for extracting company names and mapping them to stock tickers.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame with a ``Description`` column of text. It is not modified.
    sp500_data : pandas.DataFrame
        Reference table passed through to ``map_to_stock_tickers``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``map_to_stock_tickers`` for the rows in which
        at least one company name was found.
    """
    # Extract company names. assign() builds a new frame, so the caller's
    # raw_data does not gain a Company_Names column as a side effect.
    with_names = raw_data.assign(
        Company_Names=raw_data["Description"].astype(str).apply(extract_company_names)
    )
    # Drop rows without company names
    with_names = with_names.dropna(subset=["Company_Names"])

    # Clean company names
    cleaned = clean_company_names(with_names)

    # Map company names to stock tickers
    return map_to_stock_tickers(cleaned, sp500_data)

# Execute the pipeline
processed_data = main_pipeline(raw_data, sp500_data)

# Save and display results
# The CSV is written next to the headlines file instead of to a second
# hardcoded, user-specific path. On Windows with the original input location
# this resolves to the same output file as before.
output_file = os.path.join(os.path.dirname(file_path), "AVYunique_stock_companies.csv")
processed_data.to_csv(output_file, index=False)
print(processed_data.head())


                                            Headlines         Time  \
1   Disney cuts ad spending on Facebook amid growi...  Jul 18 2020   
7   Evictions nearly back to pre-pandemic levels i...  Jul 17 2020   
8   Google bans ads on coronavirus conspiracy theo...  Jul 17 2020   
10  Delta will avoid pilot furloughs if they agree...  Jul 17 2020   
12  FTC considering deposing top Facebook executiv...  Jul 17 2020   

                                          Description  \
1   Walt Disney  has become the latest company to ...   
7   Landlords in some areas of the United States a...   
8   Alphabet Inc's Google said on Friday it would ...   
10  Delta Air Lines told pilots on Friday it would...   
12  The U.S. Federal Trade Commission is consideri...   

                        Company_Names  \
1                         Walt Disney   
7            the Federal Reserve Bank   
8                        Alphabet Inc   
10                    Delta Air Lines   
12  The U.S. Federal Trade Comm

In [13]:
# Rich preview of the first five rows produced by the pipeline
processed_data.head(n=5)

Unnamed: 0,Headlines,Time,Description,Company_Names,Matched_Company,Stock_Ticker
1,Disney cuts ad spending on Facebook amid growi...,Jul 18 2020,Walt Disney has become the latest company to ...,Walt Disney,Walt Disney Company (The),DIS
7,Evictions nearly back to pre-pandemic levels i...,Jul 17 2020,Landlords in some areas of the United States a...,the Federal Reserve Bank,The Bank of New York Mellon Corporation,BK
8,Google bans ads on coronavirus conspiracy theo...,Jul 17 2020,Alphabet Inc's Google said on Friday it would ...,Alphabet Inc,Alphabet Inc.,GOOG
10,Delta will avoid pilot furloughs if they agree...,Jul 17 2020,Delta Air Lines told pilots on Friday it would...,Delta Air Lines,"Air Products and Chemicals, Inc",APD
12,FTC considering deposing top Facebook executiv...,Jul 17 2020,The U.S. Federal Trade Commission is consideri...,The U.S. Federal Trade Commission,The Cigna Group,CI
