In [3]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Path of the back-translated dataset on the mounted Drive. It has its own name
# so that `backtranslate` only ever refers to the DataFrame, never to a string.
BACKTRANSLATE_CSV = '/content/drive/MyDrive/deeplearningProgression/backtranslate.csv'
backtranslate = pd.read_csv(BACKTRANSLATE_CSV)

# Quick look at the first rows to confirm the file loaded as expected.
print(backtranslate.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                               title  \
0  law enforcement high alert following threat co...   
1                                            missing   
2  unbelievable obamas attorney general say charl...   
3  bobby jindal raised hindu us story christian c...   
4  satan 2 russia unvelis image terrifying new su...   

                                                text  label  title_missing  \
0  comment expected barack obama member fyf911 fu...      1          False   
1                          post vote hillary already      1           True   
2  demonstrator gathered last night exercising co...      1          False   
3  dozen politically active pastor came private d...      0          False   
4  rs28 sarmat missile dubbed satan 2 replace ss1...      1          False   

   text_missing                                       title_token

In [18]:
# 1. Remove any unnamed columns (if present)
# Flag every column whose label starts with "Unnamed", then keep the rest.
unnamed_mask = backtranslate.columns.str.contains('^Unnamed')
backtranslate = backtranslate.loc[:, ~unnamed_mask]

In [19]:
# 2. Handle missing values for `back_translated_text`
# First record which rows had no usable text (NaN or the empty string), then
# write the literal placeholder 'missing' into both kinds of gaps.
bt_text = backtranslate['back_translated_text']
backtranslate['back_translated_text_missing'] = bt_text.eq('') | bt_text.isna()
backtranslate['back_translated_text'] = bt_text.fillna('missing').replace('', 'missing')

In [20]:
# 3. Convert all text in `back_translated_text` to lowercase
lowered_text = backtranslate['back_translated_text'].str.lower()
backtranslate['back_translated_text'] = lowered_text

In [21]:
import re
import string

# 4. Remove punctuation
# One explicit pattern: any character that is neither a word character nor
# whitespace, plus the underscore (which \w would otherwise keep). This avoids
# interpolating string.punctuation into a character class unescaped, where its
# backslash, bracket and hyphen are parsed as regex syntax rather than literals.
_PUNCTUATION_RE = re.compile(r"[^\w\s]|_")


def remove_punctuation(text):
    """Strip punctuation and symbol characters from ``text``.

    Every character that is not a word character or whitespace is removed, and
    underscores are removed as well. Letters, digits and whitespace are left
    exactly as they were (whitespace is not collapsed).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _PUNCTUATION_RE.sub("", text)

# Clean every row of the text column with remove_punctuation.
backtranslate['back_translated_text'] = backtranslate['back_translated_text'].map(remove_punctuation)

In [22]:
# Download necessary NLTK models and corpora
# nltk is imported here because this cell runs before the cell that imports it;
# without this line a fresh kernel fails with NameError on the first download.
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# 5. Remove stopwords
# English stopword list from NLTK, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with English stopwords dropped.

    The text is split with word_tokenize; a token is discarded when its
    lowercase form is in ``stop_words``. Surviving tokens are joined back
    together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# NOTE(review): punctuation was stripped in an earlier step, so stopword entries
# that contain an apostrophe presumably no longer match here -- verify.
backtranslate['back_translated_text'] = backtranslate['back_translated_text'].map(remove_stopwords)

In [24]:
# 6. Tokenization
def tokenize_text(text):
    """Split ``text`` with word_tokenize and keep only alphanumeric tokens.

    A token is kept when ``str.isalnum()`` is true for it; the result is a list
    of strings in their original order.
    """
    return list(filter(str.isalnum, word_tokenize(text)))

# Create a new column for tokenized text
token_lists = backtranslate['back_translated_text'].map(tokenize_text)
backtranslate['back_translated_text_tokens'] = token_lists

In [25]:
from nltk.stem import WordNetLemmatizer
# 7. Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every non-stopword token of ``text``.

    The text is split with word_tokenize and each token is lowercased. Tokens
    found in ``stop_words`` are skipped; the others are passed to
    ``lemmatizer.lemmatize`` (no part-of-speech argument is supplied). The
    lemmas are joined with single spaces.
    """
    lemmas = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        if lowered not in stop_words:
            lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(lemmas)

# NOTE(review): `back_translated_text_tokens` was built in the tokenization step,
# before this lemmatization, and is not refreshed here -- confirm that is intended.
backtranslate['back_translated_text'] = backtranslate['back_translated_text'].map(lemmatize_text)


Save the processed dataset and check the result

In [26]:
# Save the processed dataset to a new CSV file
backtranslate.to_csv('backtranslate_processed.csv', index=False)

# Check the quality of the cleaned dataset
preview_columns = ['back_translated_text', 'back_translated_text_tokens']
print("\nProcessed Data:")
print(backtranslate[preview_columns].head())

# Print the lengths
# The second figure is the sum of the per-row token counts, i.e. the total
# number of tokens in the whole column, not the number of rows.
row_count = len(backtranslate)
token_total = backtranslate['back_translated_text_tokens'].map(len).sum()
print(f"\nTotal number of rows: {row_count}")
print(f"Length of tokenized text column: {token_total}")


Processed Data:
                                back_translated_text  \
0  wait comment barack obama member fyf911 fukyof...   
1                                      voted hillary   
2  protester gathered last night exercise constit...   
3  dozen politically active pastor came private d...   
4  rs28 sarmat missile called satan 2 replaces ss...   

                         back_translated_text_tokens  
0  [wait, comment, barack, obama, member, fyf911,...  
1                                   [voted, hillary]  
2  [protester, gathered, last, night, exercise, c...  
3  [dozen, politically, active, pastor, came, pri...  
4  [rs28, sarmat, missile, called, satan, 2, repl...  

Total number of rows: 29995
Length of tokenized text column: 8654262
