In [3]:
import pandas as pd

# Input: token-per-line text dump of tweets; output: one CSV row per tweet
txt_file = "/content/Sarcasm_tweets_with_language.txt"
csv_file = "/content/Sarcasm_tweets_with_language.csv"


def parse_tagged_tweets(lines):
    """Group "<word> <lang>" token lines under the tweet-ID line preceding them.

    A line made up only of digits starts a new tweet. Every other line is
    split at its last run of whitespace (space or tab) into a word and its
    language tag. Lines that do not split into two parts (blank lines,
    untagged tokens) and token lines seen before the first ID are ignored.

    Args:
        lines: Iterable of text lines, e.g. an open text file.

    Returns:
        dict mapping each tweet ID (str) to {"Text": ..., "Languages": ...},
        both space-joined in token order. If an ID occurs more than once,
        the later occurrence replaces the earlier one.
    """
    tokens_by_id = {}
    current = None  # (words, langs) lists of the tweet being read

    for raw_line in lines:
        line = raw_line.strip()

        if line.isdigit():
            # New tweet: its tokens accumulate in a fresh pair of lists
            current = tokens_by_id[line] = ([], [])
            continue

        # sep=None splits on any whitespace, so tab-delimited files work too
        parts = line.rsplit(None, 1)
        if current is not None and len(parts) == 2:
            current[0].append(parts[0])
            current[1].append(parts[1])

    return {
        tweet_id: {"Text": " ".join(words), "Languages": " ".join(langs)}
        for tweet_id, (words, langs) in tokens_by_id.items()
    }


with open(txt_file, "r", encoding="utf-8") as file:
    tweets = parse_tagged_tweets(file)

# Explicit columns keep the frame well-formed even when no tweet was parsed
df = pd.DataFrame(
    [{"Tweet ID": tweet_id, **fields} for tweet_id, fields in tweets.items()],
    columns=["Tweet ID", "Text", "Languages"],
)

# Save to CSV
df.to_csv(csv_file, index=False, encoding="utf-8")

print(f"✅ CSV file saved at {csv_file}")


✅ CSV file saved at /content/Sarcasm_tweets_with_language.csv


In [4]:
import pandas as pd
from collections import Counter

# Read the per-tweet CSV back into a DataFrame
csv_file = "/content/Sarcasm_tweets_with_language.csv"  # adjust if the file lives elsewhere
df = pd.read_csv(filepath_or_buffer=csv_file)

# Function to calculate CMI for each tweet
def calculate_cmi(langs, language_independent_tags=()):
    """Compute the Code-Mixing Index (CMI) of one tweet from its language tags.

    CMI = 100 * (1 - f_m / (N - u)), where N is the number of tokens, u the
    number of tokens carrying a language-independent tag, and f_m the token
    count of the most frequent remaining language. The result is 0 when
    N == u (no language-specific token at all).

    Args:
        langs: Whitespace-separated language tags, one per token. None or
            NaN (how pandas reads an empty CSV cell) is treated as a tweet
            with no tokens.
        language_independent_tags: Tags that count towards u instead of
            towards a language. Empty by default, i.e. u == 0 and every tag
            is treated as a language.

    Returns:
        The CMI as a float in [0, 100], rounded to two decimals.
    """
    # NaN is the only float that differs from itself
    if langs is None or (isinstance(langs, float) and langs != langs):
        return 0.0

    tags = langs.split()
    neutral = frozenset(language_independent_tags)

    lang_counts = Counter(tag for tag in tags if tag not in neutral)
    N = len(tags)  # Total number of tokens
    u = N - sum(lang_counts.values())  # Language-independent tokens

    # Guard must run before max(): max() of an empty Counter raises ValueError
    if N == u:
        return 0.0

    f_m = max(lang_counts.values())  # Frequency of the most used language
    return round((1 - (f_m / (N - u))) * 100, 2)

# Score every tweet from the tags in its "Languages" column
cmi_scores = df["Languages"].apply(calculate_cmi)
df["CMI"] = cmi_scores

# Write the frame, now carrying the CMI column, to a separate CSV
cmi_csv_file = "/content/Sarcasm_tweets_with_cmi.csv"
df.to_csv(cmi_csv_file, index=False, encoding="utf-8")

print("✅ CMI calculated and saved successfully!")


✅ CMI calculated and saved successfully!
