In [None]:
import numpy as np
import pandas as pd

import os
import nltk

import re
import textwrap

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas diagnostics (e.g. SettingWithCopyWarning /
# FutureWarning) that the column assignments on filtered frames further down
# may emit — consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

In [None]:
# Colab-only helpers. `files` is only referenced by the commented-out download
# call at the end of the notebook; `drive` is used to mount Google Drive.
from google.colab import files
from google.colab import drive
# Mount Google Drive under /content/drive so the CSV path read below resolves.
drive.mount('/content/drive')

## Load Data

In [None]:
# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# Read the dataset from the mounted Drive.
# Alternative location used previously:
#   "/content/drive/My Drive/NLP_Final_Project/DATA/urban_dict_wo_dupes.csv"
df = pd.read_csv("/content/drive/My Drive/DATA/urban_dict_wo_dupes.csv")

In [None]:
# Drop the 'current_vote' column. Rebinding `df` (instead of inplace=True) together
# with errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns='current_vote', errors='ignore')
df.head()

In [None]:
def clean_text(text):
    """Normalise a raw text value into a cleaned single-line string.

    Steps, in order:
      1. coerce the value to ``str``;
      2. drop ``</s>`` markers together with any digits and the comma after them;
      3. remove digits, parentheses, tabs and the characters ``*``, ``#``, ``_``;
      4. turn carriage returns / newlines into spaces;
      5. fix a few common typos when they appear as whitespace-delimited tokens
         (also at the very start or end of the text, and when repeated
         back-to-back) and drop stand-alone ``.`` tokens;
      6. reduce doubled double-quotes, collapse runs of spaces, and strip.

    :param text: value to clean; non-strings are converted with ``str()``.
    :return: the cleaned string.
    """
    text = str(text)

    # Remove "</s>" markers plus the optional digits and comma that follow them.
    text = re.sub(r"</s>\d*,", "", text)

    text = re.sub(r"[0-9]+", "", text)   # all digits
    text = re.sub(r"[()]", "", text)     # parentheses
    text = re.sub(r"\t", "", text)       # tabs are removed, not replaced
    text = re.sub(r"[\r\n]", " ", text)  # line breaks become spaces
    text = re.sub(r"[*#_]", "", text)    # some non-standard punctuation

    # Token-level corrections. Each key must stand alone between whitespace or a
    # string boundary, so "u-turn" or "end." are left untouched.
    corrections = {
        "teh": "the",
        "u": "you",
        "adn": "and",
        "tho": "though",
        ".": "",  # a stray, free-standing period is dropped
        # Add more corrections here if needed
    }
    for wrong, right in corrections.items():
        # Lookarounds (rather than literal surrounding spaces) let neighbouring
        # tokens share a separator and also match at the start/end of the text.
        text = re.sub(r"(?<!\S)" + re.escape(wrong) + r"(?!\S)", right, text)

    # Reduce doubled double-quotes to one.
    text = text.replace('""', '"')

    # Collapse any run of spaces to a single space (a plain "  " -> " " replace
    # only halves long runs, leaving double spaces behind).
    text = re.sub(r" {2,}", " ", text)

    return text.strip()

In [None]:
def extract_brackets(text):
    '''
    Extracts all text within square brackets.

    Non-string input (e.g. None or a NaN from a missing cell) yields an empty
    list instead of raising TypeError, so the function can be applied to a
    column before its missing values have been filled.

    :param text: string to scan for "[...]" spans
    :return list of bracketed text: the contents of every bracket pair, in order,
        without the brackets themselves
    '''
    if not isinstance(text, str):
        return []
    return re.findall(r'\[([^\]]*)\]', text)

# Quick check on the definition stored under index label 0.
# NOTE(review): this runs before the missing-value fill in the next cell.
extract_brackets(df.loc[0, 'definition'])

In [None]:
# Replace missing values in the text columns with empty strings.
# Assigning the result back (rather than calling fillna(inplace=True) on a column
# selection) is required for the frame to actually be updated under pandas
# Copy-on-Write, and avoids the chained-assignment deprecation warning.
text_columns = ['definition', 'example', 'author', 'word']
df[text_columns] = df[text_columns].fillna('')

In [None]:
df['bracketed_dfn'] = df['definition'].apply(extract_brackets)

In [None]:
df['bracketed_exmpl'] = df['example'].apply(extract_brackets)

## Clean Text

In [None]:
# Run clean_text over both free-text columns.
for text_column in ('definition', 'example'):
    df[text_column] = df[text_column].apply(clean_text)

In [None]:
df['vote_ratio'] = df['thumbs_up'] / (df['thumbs_up'] + df['thumbs_down'])

In [None]:
df.head()

In [None]:
df['vote_diff'] = df['thumbs_up'] - df['thumbs_down']

In [None]:
print(len(df))

In [None]:
print(len(df[df['vote_diff'] >= 400]))

In [None]:
df2 = df[df['vote_diff'] >= 400]

In [None]:
len(df) - len(df2)

In [None]:
df2['definition'] = df2.apply(lambda x: x['definition'].replace('[', ''), axis=1)
df2['definition'] = df2.apply(lambda x: x['definition'].replace(']', ''), axis=1)
df2['example'] = df2.apply(lambda x: x['example'].replace('[', ''), axis=1)
df2['example'] = df2.apply(lambda x: x['example'].replace(']', ''), axis=1)

In [None]:
df2.head()

In [None]:
df2['compare'] = df2.apply(lambda x: 'Yes' if x['word'].lower() in x['example'].lower() else 'No', axis=1)

In [None]:
df2.head()

In [None]:
len(df2[df2['compare'] == 'Yes'])

In [None]:
len(df2) - len(df2[df2['compare'] == 'Yes'])

In [None]:
df2 = df2[df2['compare'] == 'Yes']

In [None]:
# Truncate 'definition' column
df2['definition'] = df2['definition'].str[:256]

# Truncate 'example' column
df2['example'] = df2['example'].str[:256]

In [None]:
df2['compare2'] = df2.apply(lambda x: 'Yes' if x['word'].lower() in x['example'].lower() else 'No', axis=1)

In [None]:
len(df2[df2['compare2'] == 'Yes'])

In [None]:
len(df2) - len(df2[df2['compare2'] == 'Yes'])

In [None]:
df2 = df2[df2['compare2'] == 'Yes']

In [None]:
def _build_question(row):
    """Compose the prompt for one row from its 'word' and 'example' values."""
    return f"What is the meaning of {row['word']} in the following example sentence?: {row['example']}"

df2['question'] = df2.apply(_build_question, axis=1)

df2.head()

In [None]:
df2['question'] = df2['question'].str[:256]

df2['compare3'] = df2.apply(lambda x: 'Yes' if x['question'].lower().count(x['word'].lower()) >= 2 else 'No', axis=1)

In [None]:
len(df2[df2['compare3'] == 'No'])

In [None]:
len(df2[df2['compare3'] == 'Yes'])

In [None]:
len(df2) - len(df2[df2['compare3'] == 'Yes'])

In [None]:
df2 = df2[df2['compare3'] == 'Yes']

In [None]:
df2['masked_example'] = df2.apply(lambda x: x['example'].lower().replace(x['word'].lower(),'<extra_id_0>'), axis=1)

In [None]:
df2.head()

In [None]:
print(len(df2[df2['masked_example'].str.contains('<extra_id_0>')]))

In [None]:
# Smallest net-vote count and vote ratio among the remaining rows.
# Series.min() is vectorised and skips NaN, whereas the builtin min() iterates
# element by element and gives order-dependent results when NaN is present.
print(df2['vote_diff'].min())
print(df2['vote_ratio'].min())

In [None]:
# De-duplicate words case-insensitively: order rows by 'vote_diff' (highest first)
# and keep the first row for each lower-cased word. The helper column
# 'word_lower' exists only inside this chain and is removed at the end.
df2 = (
    df2.assign(word_lower=df2['word'].str.lower())
       .sort_values(by='vote_diff', ascending=False)
       .drop_duplicates('word_lower')
       .drop(columns=['word_lower'])
)

In [None]:
df2.head()

In [None]:
df_model = df2.drop(['thumbs_up', 'thumbs_down', 'bracketed_dfn', 'bracketed_exmpl', 'compare'], axis=1)
df_model = df_model.sample(frac=1, random_state=1).reset_index(drop=True)
df_model.shape

In [None]:
# Save the filtered DataFrame (df2) to a CSV file
# df2.to_csv('urban_dict_filtered_v2.csv', index=False)

# Download the CSV file

# files.download('urban_dict_filtered_v2.csv')