In [2]:
import pandas as pd
import requests
from pathlib import Path
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.chunk import ne_chunk
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Text-normalisation helpers shared by every page that gets processed:
# a lemmatizer and the set of English stop words to drop.
lemmatizer = WordNetLemmatizer()
stop_words = set()
stop_words.update(stopwords.words('english'))
# Folder that holds the numbered id files, and the table read from urls.csv
# (its length drives the per-row loop further down).
id_file_root_path = Path(r"C:\Users\vikas\Downloads\assignment\assignment")
url_df = pd.read_csv(r"C:\Users\vikas\Downloads\assignment\assignment\urls.csv")
# Page that every row is currently scored against.
# NOTE(review): this address is fixed, so every id receives the keywords of the
# same page (the captured output repeats one list for every row). If each row
# of url_df is meant to supply its own address, assign `url` from url_df inside
# the loop below -- the column name is not visible here, so it is left as-is.
PAGE_URL = "https://www.pwc.com/gx/en/issues/transformation.html"
# Seconds to wait for the web server before giving up on a page.
REQUEST_TIMEOUT_SECONDS = 30
# Number of top-ranked terms kept per page.
TOP_KEYWORD_COUNT = 10


def _extract_keywords(page_url):
    """Download *page_url* and return its top terms followed by named entities.

    The page text is lower-cased, tokenised, stripped of stop words and
    non-alphabetic tokens, and lemmatised. The result is the
    ``TOP_KEYWORD_COUNT`` highest-scoring terms (best first) followed by any
    named entities, with duplicates removed and order preserved.

    Needs the NLTK tokenizer, tagger, WordNet and NE-chunker data to be
    installed already; the visible part of this file only downloads
    'stopwords'.

    Raises:
        requests.RequestException: the page could not be fetched, timed out,
            or answered with an HTTP error status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly instead of extracting "keywords" from an error page.
    response.raise_for_status()
    text = BeautifulSoup(response.text, 'html.parser').get_text()

    tokens = word_tokenize(text.lower(), language="english")
    filtered_tokens = [
        lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stop_words and w.isalpha()
    ]
    if not filtered_tokens:
        # TfidfVectorizer raises on an empty vocabulary; a page without usable
        # words simply has no keywords.
        return []

    tagged_tokens = pos_tag(filtered_tokens)

    # binary=True makes the chunker label every entity 'NE'. In the default
    # multi-class mode the labels are PERSON/ORGANIZATION/..., so the 'NE'
    # filter below never matched and no entity was ever collected.
    named_entities = [
        ' '.join(token for token, _tag in chunk)
        for chunk in ne_chunk(tagged_tokens, binary=True)
        if hasattr(chunk, 'label') and chunk.label() == 'NE'
    ]

    # The vectorizer is fitted on this single document, so the IDF factor is
    # the same for every term and the ranking is effectively by term frequency.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(filtered_tokens)])
    scored_terms = zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0])
    ranked_terms = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)
    keywords = [word for word, _score in ranked_terms[:TOP_KEYWORD_COUNT]]

    # dict.fromkeys de-duplicates while keeping the ranking order, so the
    # output is identical from run to run (a set would shuffle it).
    return list(dict.fromkeys(keywords + named_entities))


# Keywords already computed, keyed by page address, so a page that appears on
# many rows or in many id files is downloaded and analysed only once.
keywords_by_url = {}

# Loop through each id file
for i in range(0, 10):  # id files are named "0.csv", "1.csv", ..., "9.csv"; missing ones are skipped
    id_file_path = id_file_root_path / f"{i}.csv"
    try:
        # Load the IDs from the current id file
        id_df = pd.read_csv(id_file_path)

        # Extracted keywords for each id, in row order
        keywords_dict = {}

        # Rows are paired by position; stop at the shorter table so a short
        # id file cannot raise IndexError.
        row_count = min(len(url_df), len(id_df))
        for j in range(row_count):
            url = PAGE_URL
            row_id = id_df.iloc[j]["text"]

            if url not in keywords_by_url:
                keywords_by_url[url] = _extract_keywords(url)
            # Copy so that rows never share one list object with the cache.
            combined_keywords = list(keywords_by_url[url])
            print(combined_keywords)

            # Add the extracted keywords to the dictionary
            keywords_dict[row_id] = combined_keywords

        # Write the extracted keywords to a csv file. Columns are sized by
        # pandas (0..n-1 for the longest row, shorter rows padded), which also
        # copes with rows of different lengths and with an empty result.
        keywords_df = pd.DataFrame.from_dict(keywords_dict, orient='index')
        keywords_df.index.name = 'id'
        output_dir = id_file_root_path / "Output"
        output_dir.mkdir(parents=True, exist_ok=True)
        keywords_df.to_csv(output_dir / f"keywords_{i}.csv")
        print("Done 😊")
    except FileNotFoundError:
        # The id file for this index does not exist; move on to the next one.
        print("NA :  ", id_file_path)
    except Exception as exc:
        # Keep the run best-effort across files, but say what actually failed
        # instead of reporting every error as a missing file.
        print("Failed :  ", id_file_path, "-", exc)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'techn

['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc', 'technology', 'value']
Done 😊
['service', 'transformation', 'business', 'issue', 'strategy', 'menu', 'industry', 'pwc',