In [None]:
# web Scrapping from website url present in xlsl file uisng beatifulsoup
!pip install pandas requests beautifulsoup4 openpyxl



In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import openpyxl
from urllib.parse import urlparse

def extract_main_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Try to find main content
        main_content = None
        potential_content_tags = ['article', 'main', '[role="main"]', '.main-content', '#main-content']

        for tag in potential_content_tags:
            main_content = soup.select_one(tag)
            if main_content:
                break

        if not main_content:
            # If no main content container found, use the body
            main_content = soup.body

        # Remove potential non-article elements
        for elem in main_content.select('header, footer, nav, aside, .sidebar, .comments'):
            elem.decompose()

        # Extract text from remaining paragraphs
        paragraphs = main_content.find_all('p')
        text = ' '.join([p.get_text(strip=True) for p in paragraphs])

        return text.strip()
    except Exception as e:
        return f"Error: {str(e)}"

def process_urls_from_excel(input_file, output_file):
    # Read URLs from Excel file
    df = pd.read_excel(input_file)

    # Ensure there's a 'URL' column and a 'URL_ID' column
    if 'URL' not in df.columns or 'URL_ID' not in df.columns:
        raise ValueError("Excel file must contain both 'URL' and 'URL_ID' columns")

    results = []

    # Process each URL
    for _, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        text = extract_main_content(url)
        results.append({'URL_ID': url_id, 'URL': url, 'Text': text})

    # Create a new DataFrame with results
    output_df = pd.DataFrame(results)

    # Write results to a new Excel file
    output_df.to_excel(output_file, index=False)
    print(f"Results written to {output_file}")

# Usage
input_file = 'Input.xlsx'
output_file = 'output.xlsx'
process_urls_from_excel(input_file, output_file)

Results written to output.xlsx


In [None]:
# display Xlsx file in pandas frame
df = pd.read_excel('output.xlsx')
df

Unnamed: 0,URL_ID,URL,Text
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm in the USA Indus...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm in the USA Industry...
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm in the USA Industry...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm in the USA Indus...
...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm in the USA Ind...
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm in the USA Ind...
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm in the U...
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kiban...


In [None]:
df['Text'][0]

'Client:A leading insurance firm worldwide Industry Type:BFSI Products & Services:Insurance Organization Size:10000+ The insurance industry, particularly in the context of providing coverage to Public Company Directors against Insider Trading public lawsuits, faces a significant challenge in accurately determining insurance premiums. Traditional methods of premium calculation may lack precision, and there is a growing need for more sophisticated and data-driven approaches. The integration of Artificial Intelligence (AI) and Machine Learning (ML) models in predicting insurance premiums for this specialized coverage is essential to enhance accuracy, fairness, and responsiveness in adapting to evolving risk factors. The problem at hand involves developing robust AI and ML models that can effectively analyze a multitude of dynamic variables influencing the risk profile of Public Company Directors. These variables include market conditions, regulatory changes, historical legal precedents, f

In [None]:
# text analysis
!pip install textblob




In [None]:
# Sentiment analysis
# Cleaning the text using Stopwords and Punction marks
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
# use custom stop words or stopword file
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import os
import chardet

def detect_encoding(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    return chardet.detect(raw_data)['encoding']

def load_stopwords(folder_path):
    stopwords = set()

    if not os.path.exists(folder_path):
        print(f"Error: The folder {folder_path} does not exist.")
        return stopwords

    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            try:
                # Detect the file encoding
                encoding = detect_encoding(file_path)

                # If detection fails, try common encodings
                if not encoding:
                    encodings_to_try = ['utf-8', 'iso-8859-1', 'windows-1252']
                else:
                    encodings_to_try = [encoding]

                for enc in encodings_to_try:
                    try:
                        with open(file_path, 'r', encoding=enc) as file:
                            words = file.read().split()
                            stopwords.update(word.strip().lower() for word in words)
                        print(f"Successfully read {filename} with {enc} encoding")
                        break
                    except UnicodeDecodeError:
                        if enc == encodings_to_try[-1]:
                            print(f"Failed to read {filename} with all attempted encodings")
                        continue

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

    print(f"Loaded {len(stopwords)} unique stopwords from {folder_path}")
    return stopwords

# Usage
stopwords_folder = '/content/drive/MyDrive/Blackcoffer/StopWords/'  # Replace with the actual path to your stopwords folder
stopwords = load_stopwords(stopwords_folder)

# You can now use the 'stopwords' set in your text analysis

Successfully read StopWords_Currencies.txt with ISO-8859-1 encoding
Successfully read StopWords_GenericLong.txt with ascii encoding
Successfully read StopWords_Names.txt with ascii encoding
Successfully read StopWords_Generic.txt with ascii encoding
Successfully read StopWords_DatesandNumbers.txt with ascii encoding
Successfully read StopWords_Geographic.txt with ascii encoding
Successfully read StopWords_Auditor.txt with ascii encoding
Loaded 12840 unique stopwords from /content/drive/MyDrive/Blackcoffer/StopWords/


In [None]:
print(stopwords)

{'laurena', 'olinda', 'jonie', 'emilee', 'eleonor', 'catheryn', 'milagro', 'sutherland', 'tosha', 'falcon', 'fair', 'dillingham', 'perla', 'bolding', 'mabry', 'following', 'germain', 'silva', 'alysa', 'lee', 'curry', 'cusick', 'nebraska', 'beall', 'tharp', 'deck', 'quincy', 'brewington', 'voigt', 'horacio', 'earlene', 'wen', 'marcie', 'durham', 'forrest', 'louis', 'salgado', 'palermo', 'overall', 'elyse', 'neighbors', 'katerine', 'brant', 'sarmiento', 'schroder', 'mcnabb', 'lexie', 'ermelinda', 'corina', 'webb', 'karp', 'mathilde', 'tressie', 'stitt', 'dann', 'osborne', 'copeland', 'teel', 'ohara', 'humphries', 'lindgren', 'jarrod', 'bunker', 'beatty', 'shafer', 'light', 'gaylor', 'thomasina', 'dixie', 'mirta', 'christian', 'marchand', 'serrano', '.002%.', 'tyrrell', 'poteat', 'kaley', 'ned', 'jimerson', 'georgene', 'briana', 'lizzie', 'coates', 'castro', 'latimer', 'conyers', 'millar', 'lou', 'rebecca', 'verdie', 'clarine', 'coley', 'hitchcock', 'newell', 'janine', 'lail', 'herbert', 

In [None]:
# find particular stopwrod is present or not
'cedi' in stopwords # currency stop word



True

In [None]:
# read the positive words from the positive txt file and store them in positive words
positive_words = set()
with open('/content/drive/MyDrive/Blackcoffer/MasterDictionary/positive-words.txt', 'r', encoding='latin-1') as file: # Try 'latin-1' encoding
    for line in file:
        positive_words.add(line.strip())
# for negative words
negative_words = set()
with open('/content/drive/MyDrive/Blackcoffer/MasterDictionary/negative-words.txt', 'r', encoding='latin-1') as file: # Try 'latin-1' encoding
    for line in file:
        negative_words.add(line.strip())

In [None]:
print(positive_words)


{'authoritative', 'captivating', 'brighten', 'beauteous', 'meaningful', 'reassurance', 'decent', 'fair', 'shimmering', 'remedy', 'enhancement', 'formidable', 'aver', 'refund', 'politeness', 'fancy', 'intuitive', 'awesomely', 'abundant', 'invigorate', 'achievible', 'hands-down', 'bonuses', 'laudably', 'continuity', 'conveniently', 'beautify', 'satisified', 'crisp', 'devout', 'cool', 'benefactor', 'cleaner', 'fervently', 'knowledgeable', 'traction', 'dawn', 'excites', 'blossom', 'pleasurably', 'mesmerizing', 'entice', 'excallent', 'irreplaceable', 'straighten', 'sumptuous', 'unbiased', 'prestigious', 'reputation', 'splendidly', 'hallowed', 'gracefully', 'ilu', 'wonderously', 'improvement', 'felicitate', 'survival', 'trusty', 'alluring', 'wise', 'effectiveness', 'affable', 'gem', 'restful', 'dotingly', 'well-being', 'dependably', 'glisten', 'satisfying', 'user-friendly', 'uncomplicated', 'opulent', 'reliably', 'impressed', 'enthral', 'personalized', 'accolades', 'fresher', 'easing', 'over

In [None]:
'accolade' in positive_words

True

In [None]:
# remove the stopwords from Text in output xlsx
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)
# pass the Text from column wise to reove stop words



In [None]:
df['Text'] = df['Text'].apply(remove_stopwords)
df

Unnamed: 0,URL_ID,URL,Text
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...
...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...


In [None]:
df['Text'][0]

'Client:A leading insurance firm worldwide Industry Type:BFSI Products & Services:Insurance Organization Size:10000+ insurance industry, context providing coverage Public Company Directors Insider Trading public lawsuits, faces significant challenge accurately determining insurance premiums. Traditional methods premium calculation lack precision, growing sophisticated data-driven approaches. integration Artificial Intelligence (AI) Machine Learning (ML) models predicting insurance premiums specialized coverage essential enhance accuracy, fairness, responsiveness adapting evolving risk factors. problem involves developing robust ML models effectively analyze multitude dynamic variables influencing risk profile Public Company Directors. variables include market conditions, regulatory changes, historical legal precedents, financial performance insured company, individual directorial behaviors. goal create predictive model accurately assesses risk potential insider trading public lawsuits 

In [None]:
#  find any Stopword present in df[text][0]
for word in df['Text'][0].split():
    if word.lower() in stopwords:
        print(word)
    else:
        print('no')


no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
n

In [None]:
# postive_word count for Text In df
def count_positive_words(text):
    count = 0
    for word in text.split():
        if word.lower() in positive_words:
            count += 1
    return count
df['Positive_Score'] = df['Text'].apply(count_positive_words)
df

Unnamed: 0,URL_ID,URL,Text,Positive_Score
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1
...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0


In [None]:
# calculate negative_words
def count_negative_words(text):
    count = 0
    for word in text.split():
        if word.lower() in negative_words:
            count -= 1
    return count
df['Negative_Score'] = df['Text'].apply(count_negative_words)
df


Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0
...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0


In [None]:
# calculate popularity score
df['Popularity_Score'] = (df['Positive_Score'] - df['Negative_Score']) / (df['Positive_Score'] + df['Negative_Score'] + 0.000001)
df

Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score,Popularity_Score
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8,1.727273e+00
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01
...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6,3.999999e+00
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16,3.200000e+07
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4,8.999991e+00
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0,0.000000e+00


In [None]:
# subjective score calculation
df['Word_Count'] = df['Text'].apply(lambda x: len(x.split()))
df['Subjective_Score'] = (df['Positive_Score'] + df['Negative_Score']) / (df['Word_Count'] + 0.000001)
df


Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score,Popularity_Score,Word_Count,Subjective_Score
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8,1.727273e+00,457,0.048140
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,46,0.021739
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,48,0.020833
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277
...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6,3.999999e+00,530,0.007547
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16,3.200000e+07,812,0.000000
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4,8.999991e+00,100,0.010000
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0,0.000000e+00,8,0.000000


In [None]:
# average sentence length clculation
df['Sentence_Count'] = df['Text'].apply(lambda x: len(re.split(r'[.!?]', x)))
df['Complex_Words'] = df['Text'].apply(lambda x: len([word for word in x.split() if len(word) > 2]))
df['Average_Sentence_Length'] = df['Word_Count'] / (df['Sentence_Count'] + 0.000001)
df['percentage_of_complex_words'] = (df['Complex_Words'] / (df['Word_Count'] + 0.000001)) * 100
df['fox_index'] = 0.4 * (df['Average_Sentence_Length'] + df['percentage_of_complex_words'])
df

Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score,Popularity_Score,Word_Count,Subjective_Score,Sentence_Count,Complex_Words,Average_Sentence_Length,percentage_of_complex_words,fox_index
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8,1.727273e+00,457,0.048140,48,446,9.520833,97.592998,42.845532
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,46,0.021739,8,44,5.749999,95.652172,40.560868
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,48,0.020833,8,45,5.999999,93.749998,39.899999
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6,3.999999e+00,530,0.007547,39,495,13.589743,93.396226,42.794388
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16,3.200000e+07,812,0.000000,67,746,12.119403,91.871921,41.596530
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4,8.999991e+00,100,0.010000,16,100,6.250000,99.999999,42.499999
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0,0.000000e+00,8,0.000000,1,7,7.999992,87.499989,38.199992


In [None]:
df['average-number-of-words-per-sentence'] = df['Word_Count'] / df['Sentence_Count']

In [None]:
df

Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score,Popularity_Score,Word_Count,Subjective_Score,Sentence_Count,Complex_Words,Average_Sentence_Length,percentage_of_complex_words,fox_index,average-number-of-words-per-sentence
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8,1.727273e+00,457,0.048140,48,446,9.520833,97.592998,42.845532,9.520833
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,46,0.021739,8,44,5.749999,95.652172,40.560868,5.750000
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,48,0.020833,8,45,5.999999,93.749998,39.899999,6.000000
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6,3.999999e+00,530,0.007547,39,495,13.589743,93.396226,42.794388,13.589744
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16,3.200000e+07,812,0.000000,67,746,12.119403,91.871921,41.596530,12.119403
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4,8.999991e+00,100,0.010000,16,100,6.250000,99.999999,42.499999,6.250000
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0,0.000000e+00,8,0.000000,1,7,7.999992,87.499989,38.199992,8.000000


In [None]:

df['char'] = df['Text'].apply(lambda x: len(x))
df['Averge_Word_Length'] = df['char'] / (df['Word_Count'] + 0.000001)
df

Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score,Popularity_Score,Word_Count,Subjective_Score,Sentence_Count,Complex_Words,Average_Sentence_Length,percentage_of_complex_words,fox_index,average-number-of-words-per-sentence,char,Averge_Word_Length
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8,1.727273e+00,457,0.048140,48,446,9.520833,97.592998,42.845532,9.520833,4233,9.262582
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000,490,10.425532
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,46,0.021739,8,44,5.749999,95.652172,40.560868,5.750000,468,10.173913
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,48,0.020833,8,45,5.999999,93.749998,39.899999,6.000000,485,10.104166
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000,493,10.489361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6,3.999999e+00,530,0.007547,39,495,13.589743,93.396226,42.794388,13.589744,4021,7.586792
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16,3.200000e+07,812,0.000000,67,746,12.119403,91.871921,41.596530,12.119403,6219,7.658867
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4,8.999991e+00,100,0.010000,16,100,6.250000,99.999999,42.499999,6.250000,846,8.460000
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0,0.000000e+00,8,0.000000,1,7,7.999992,87.499989,38.199992,8.000000,114,14.249998


In [None]:
#  extract personal pronouns
import re

def count_personal_pronouns(text):
    # List of personal pronouns to search for
    pronouns = [
        "I", "me", "my", "mine", "myself",
        "you", "your", "yours", "yourself", "yourselves",
        "he", "him", "his", "himself",
        "she", "her", "hers", "herself",
        "it", "its", "itself",
        "we", "us", "our", "ours", "ourselves",
        "they", "them", "their", "theirs", "themselves"
    ]

    # Convert text to lowercase for case-insensitive matching
    text = text.lower()

    # Count all pronouns in the text
    total_count = 0
    for pronoun in pronouns:
        # Use word boundaries to ensure we're matching whole words
        count = len(re.findall(r'\b' + re.escape(pronoun) + r'\b', text))

        # Handle exceptions (currently only for "us")
        if pronoun == "us":
            # Subtract occurrences of "US" referring to United States
            us_as_country = len(re.findall(r'\b(united states|u\.s\.)\b', text))
            count = max(0, count - us_as_country)

        total_count += count

    return total_count

df['Personal_Pronouns'] = df['Text'].apply(count_personal_pronouns)
df

Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score,Popularity_Score,Word_Count,Subjective_Score,Sentence_Count,Complex_Words,Average_Sentence_Length,percentage_of_complex_words,fox_index,average-number-of-words-per-sentence,char,Averge_Word_Length,Personal_Pronouns
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8,1.727273e+00,457,0.048140,48,446,9.520833,97.592998,42.845532,9.520833,4233,9.262582,1
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000,490,10.425532,0
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,46,0.021739,8,44,5.749999,95.652172,40.560868,5.750000,468,10.173913,2
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,48,0.020833,8,45,5.999999,93.749998,39.899999,6.000000,485,10.104166,1
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000,493,10.489361,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6,3.999999e+00,530,0.007547,39,495,13.589743,93.396226,42.794388,13.589744,4021,7.586792,1
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16,3.200000e+07,812,0.000000,67,746,12.119403,91.871921,41.596530,12.119403,6219,7.658867,1
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4,8.999991e+00,100,0.010000,16,100,6.250000,99.999999,42.499999,6.250000,846,8.460000,0
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0,0.000000e+00,8,0.000000,1,7,7.999992,87.499989,38.199992,8.000000,114,14.249998,0


In [None]:
import nltk
from nltk.corpus import cmudict

nltk.download('cmudict', quiet=True)
d = cmudict.dict()

def count_syllables(word):
    word = word.lower()

    # Check if the word is in the CMU dictionary
    if word in d:
        return max([len([y for y in x if y[-1].isdigit()]) for x in d[word]])

    # If the word is not in the dictionary, use the fallback method
    return fallback_syllable_count(word)

def fallback_syllable_count(word):
    word = word.lower()
    count = 0
    vowels = 'aeiouy'

    # Handle special cases
    if len(word) <= 3:
        return 1

    # Handle common endings
    if word.endswith('es') or word.endswith('ed'):
        # Remove 'es' or 'ed'
        word = word[:-2]
    elif word.endswith('e'):
        # Remove 'e' unless the word ends with 'le'
        if not word.endswith('le'):
            word = word[:-1]

    # Count vowel groups
    prev_char_was_vowel = False
    for char in word:
        if char in vowels:
            if not prev_char_was_vowel:
                count += 1
            prev_char_was_vowel = True
        else:
            prev_char_was_vowel = False

    # Handle special cases where counting vowel groups doesn't work well
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        count += 1

    # Ensure at least one syllable
    return max(1, count)

df['Syllable_Count'] = df['Text'].apply(count_syllables)
df

Unnamed: 0,URL_ID,URL,Text,Positive_Score,Negative_Score,Popularity_Score,Word_Count,Subjective_Score,Sentence_Count,Complex_Words,Average_Sentence_Length,percentage_of_complex_words,fox_index,average-number-of-words-per-sentence,char,Averge_Word_Length,Personal_Pronouns,Syllable_Count
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,Client:A leading insurance firm worldwide Indu...,30,-8,1.727273e+00,457,0.048140,48,446,9.520833,97.592998,42.845532,9.520833,4233,9.262582,1,1274
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000,490,10.425532,0,123
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,46,0.021739,8,44,5.749999,95.652172,40.560868,5.750000,468,10.173913,2,117
3,bctech2014,https://insights.blackcoffer.com/effective-man...,Client:A leading tech firm Industry Type:IT Pr...,1,0,9.999990e-01,48,0.020833,8,45,5.999999,93.749998,39.899999,6.000000,485,10.104166,1,121
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,Client:A leading fintech firm Industry Type:Fi...,1,0,9.999990e-01,47,0.021277,8,45,5.874999,95.744679,40.647871,5.875000,493,10.489361,0,124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,Client:A leading marketing firm Industry Type:...,10,-6,3.999999e+00,530,0.007547,39,495,13.589743,93.396226,42.794388,13.589744,4021,7.586792,1,1177
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,Client:A leading marketing firm Industry Type:...,16,-16,3.200000e+07,812,0.000000,67,746,12.119403,91.871921,41.596530,12.119403,6219,7.658867,1,1830
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,Client:A leading healthcare tech firm Industry...,5,-4,8.999991e+00,100,0.010000,16,100,6.250000,99.999999,42.499999,6.250000,846,8.460000,0,226
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,PresentationMapDashboardAPI Integration Kibana...,0,0,0.000000e+00,8,0.000000,1,7,7.999992,87.499989,38.199992,8.000000,114,14.249998,0,37


In [None]:
# convert the df into xlsx file save to google drive
df.to_excel('Output_Data_Structure.xlsx', index=False)
from google.colab import files
files.download('Output_Data_Structure.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>