In [18]:
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Read the tweets dataset from Google Drive (edit the path if the file lives elsewhere)
tweets_csv_path = '/content/drive/MyDrive/Dataset/updated_tweets.csv'
df = pd.read_csv(tweets_csv_path)

# Small English spaCy pipeline; later cells use it for POS tags and named entities
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)


In [24]:
# Normalise the 'text' column: missing values become '' and every entry is cast to str,
# so the string-based feature functions below can be applied to every row.
text_without_nulls = df['text'].fillna('')
df['text'] = text_without_nulls.astype(str)

In [25]:
def get_word_count(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

def get_max_word_length(text):
    """Return the length of the longest whitespace-separated word in ``text``.

    Returns 0 when ``text`` contains no words at all. That covers both the
    empty string and whitespace-only strings such as ``"  "`` or ``"\\n"``,
    which are truthy but split into an empty list.
    """
    # ``default=0`` handles the no-words case without raising ValueError.
    return max((len(word) for word in text.split()), default=0)

def get_min_word_length(text):
    """Return the length of the shortest whitespace-separated word in ``text``.

    Returns 0 when ``text`` contains no words at all. That covers both the
    empty string and whitespace-only strings such as ``"  "`` or ``"\\n"``,
    which are truthy but split into an empty list.
    """
    # ``default=0`` handles the no-words case without raising ValueError.
    return min((len(word) for word in text.split()), default=0)

def get_average_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Returns 0 when ``text`` contains no words.
    """
    lengths = [len(word) for word in text.split()]
    if not lengths:
        return 0
    return sum(lengths) / len(lengths)


In [26]:
def get_part_of_speech_count(text, pos_tag):
    """Count the tokens in ``text`` whose spaCy coarse POS tag equals ``pos_tag``.

    ``text`` is parsed with the module-level ``nlp`` pipeline; ``pos_tag`` is
    compared against each token's ``pos_`` attribute (e.g. ``'VERB'``).
    """
    matching_tokens = [token for token in nlp(text) if token.pos_ == pos_tag]
    return len(matching_tokens)

def get_named_entity_percentage(text, entity_label):
    """Return the share (0-100) of named entities in ``text`` labelled ``entity_label``.

    ``text`` is parsed with the module-level ``nlp`` pipeline. The result is
    the number of entities carrying ``entity_label`` divided by the total
    number of entities found, times 100. Returns 0 when no entities are found.
    """
    labels = [ent.label_ for ent in nlp(text).ents]
    if not labels:
        return 0
    matching = sum(1 for label in labels if label == entity_label)
    return matching / len(labels) * 100


In [27]:
def get_punctuation_count(text, punctuation):
    """Return how many non-overlapping times ``punctuation`` occurs in ``text``."""
    occurrences = text.count(punctuation)
    return occurrences


In [28]:
# Apply functions to extract features and add them as new columns.
# Columns are assigned in the same order as before so the saved CSV layout is unchanged.

# Pure string features (no spaCy needed)
df['Word count'] = df['text'].apply(get_word_count)
df['Max word length'] = df['text'].apply(get_max_word_length)
df['Min word length'] = df['text'].apply(get_min_word_length)
df['Average word length'] = df['text'].apply(get_average_word_length)

# spaCy parsing dominates the runtime of this cell. Calling
# get_part_of_speech_count / get_named_entity_percentage per column would parse
# every tweet 21 times (3 POS columns + 18 entity columns). Instead, parse each
# tweet once and keep only the lightweight tag/label lists that the columns need.
pos_tags_per_text = []
entity_labels_per_text = []
for text in df['text']:
    doc = nlp(text)
    pos_tags_per_text.append([token.pos_ for token in doc])
    entity_labels_per_text.append([ent.label_ for ent in doc.ents])

# Part-of-speech counts. Note: 'present_verbs' counts every token tagged VERB,
# regardless of tense.
pos_columns = {'present_verbs': 'VERB', 'adjectives': 'ADJ', 'adverbs': 'ADV'}
for column, pos_tag in pos_columns.items():
    df[column] = [tags.count(pos_tag) for tags in pos_tags_per_text]

# Punctuation counts
df['dots'] = df['text'].apply(lambda text: get_punctuation_count(text, '.'))
df['exclamation'] = df['text'].apply(lambda text: get_punctuation_count(text, '!'))
df['questions'] = df['text'].apply(lambda text: get_punctuation_count(text, '?'))

# Named Entity Recognition (NER) percentages
entities = ['ORG', 'NORP', 'GPE', 'PERSON', 'MONEY', 'DATE', 'CARDINAL', 'PERCENT', 'ORDINAL',
            'FAC', 'LAW', 'PRODUCT', 'EVENT', 'TIME', 'LOC', 'WORK_OF_ART', 'QUANTITY', 'LANGUAGE']

# For each label: share (0-100) of the tweet's entities carrying that label,
# or 0 when the tweet has no entities at all.
for entity in entities:
    df[f'{entity}_percentage'] = [
        labels.count(entity) / len(labels) * 100 if labels else 0
        for labels in entity_labels_per_text
    ]


In [29]:
# Write the feature-augmented DataFrame to a CSV in the current working directory
# (index=False keeps the row index out of the file)
output_csv = 'updated_file_bothu.csv'
df.to_csv(output_csv, index=False)

# Hand the file just written to the Colab helper so the browser downloads it
from google.colab import files
files.download(output_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
# Reload the feature file from Google Drive.
# NOTE(review): no cell shown here writes to this Drive path. The save cell above
# writes 'updated_file_bothu.csv' to the working directory and downloads it, so
# presumably the file was uploaded to Drive by hand. Confirm it is the same file.
data1 = pd.read_csv('/content/drive/MyDrive/Dataset/updated_file_bothu.csv')

In [31]:
# Display the column index of the reloaded frame to check which feature columns it contains
data1.columns

Index(['date', 'text', 'link', 'likes', 'retweets', 'comments', 'user_name',
       'user_username', 'user_avatar', 'label', 'mentions', 'word_count',
       'adjectives', 'adverbs', 'pronouns', 'capitals', 'digits', 'hashtags',
       'URLs', 'ORG_percentage', 'PERSON_percentage', 'Word count',
       'Max word length', 'Min word length', 'Average word length',
       'present_verbs', 'dots', 'exclamation', 'questions', 'NORP_percentage',
       'GPE_percentage', 'MONEY_percentage', 'DATE_percentage',
       'CARDINAL_percentage', 'PERCENT_percentage', 'ORDINAL_percentage',
       'FAC_percentage', 'LAW_percentage', 'PRODUCT_percentage',
       'EVENT_percentage', 'TIME_percentage', 'LOC_percentage',
       'WORK_OF_ART_percentage', 'QUANTITY_percentage', 'LANGUAGE_percentage'],
      dtype='object')