In [None]:
 pip install tqdm unidecode

In [None]:
import nltk
import re
import string
import unidecode
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK data packages this notebook relies on
# (tokenizer models, stopword lists, WordNet).
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource)

# Shared normalizers, created once and reused for every token
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# English stopwords as a set so per-token membership tests are cheap
stop_words = {*stopwords.words("english")}

# Crypto-related terms that must survive preprocessing unchanged
# (they are exempt from stopword removal, stemming and lemmatization).
#
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which silently merges two terms into one.
#
# NOTE(review): terms are compared against individual tokens, so the two-word
# entry "smart contract" presumably never matches -- confirm and handle
# multi-word terms separately if they matter.
crypto_terms = {
    "bitcoin", "btc", "ethereum", "eth", "blockchain", "crypto", "nft",
    "defi", "web3", "dao", "altcoin", "solana", "hamster", "dogecoin",
    "stablecoin", "smart contract", "staking", "mining", "wallet", "coin",
    "token", "airdrop", "fomo", "hodl",
}

def preprocess_sentence(sentence):
    """Return a cleaned, space-joined token string for one sentence.

    The sentence is lowercased, transliterated to ASCII, stripped of
    everything except letters, digits and whitespace, and tokenized.
    Tokens found in ``crypto_terms`` are kept verbatim; tokens found in
    ``stop_words`` are dropped; every other token is stemmed and the stem
    is then lemmatized.
    """
    # Lowercase first, then transliterate accented/special characters to ASCII
    text = unidecode.unidecode(sentence.lower())
    # Any remaining non-ASCII run becomes a single space
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    # Delete punctuation and symbols, leaving letters, digits and whitespace
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    kept = []
    for token in word_tokenize(text):
        if token in crypto_terms:
            # Protected vocabulary goes through untouched
            kept.append(token)
            continue
        if token in stop_words:
            continue
        # Stem first, then lemmatize the resulting stem
        kept.append(lemmatizer.lemmatize(stemmer.stem(token)))

    return " ".join(kept)

def preprocess_text(sentences):
    """Clean every sentence in ``sentences`` and return the results as a list.

    Each item is passed through ``preprocess_sentence``. Progress is shown
    with a tqdm bar, and an additional status line is printed after every
    10,000 processed items.
    """
    results = []
    progress = tqdm(sentences, desc="Processing Sentences", unit="sent")

    for count, raw in enumerate(progress, start=1):
        results.append(preprocess_sentence(raw))

        # Only report on exact multiples of 10,000
        if count % 10000:
            continue
        print(f"Processed {count}/{len(sentences)} sentences...")

    return results

# Read the entire raw corpus into memory
with open("output.txt", encoding="utf-8") as f:
    text = f.read()

# Treat each input line as one sentence (plain newline split rather than
# nltk's sent_tokenize)
sentences = text.split("\n")

print("Total sentences to process:", len(sentences))

# Run the cleaning pipeline over every line
cleaned_sentences = preprocess_text(sentences)

# Write the cleaned lines out, newline-separated, one per input line
with open("cleaned_data.txt", mode="w", encoding="utf-8") as f:
    f.write("\n".join(cleaned_sentences))

print("✅ Cleaning complete! File saved as cleaned_data.txt")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucku\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Total sentences to process: 102816


Processing Sentences:  10%|▉         | 10238/102816 [00:05<00:41, 2253.09sent/s]

Processed 10000/102816 sentences...


Processing Sentences:  20%|█▉        | 20407/102816 [00:10<00:36, 2244.82sent/s]

Processed 20000/102816 sentences...


Processing Sentences:  29%|██▉       | 30262/102816 [00:14<00:30, 2407.94sent/s]

Processed 30000/102816 sentences...


Processing Sentences:  39%|███▉      | 40426/102816 [00:18<00:25, 2481.18sent/s]

Processed 40000/102816 sentences...


Processing Sentences:  49%|████▉     | 50487/102816 [00:22<00:21, 2416.42sent/s]

Processed 50000/102816 sentences...


Processing Sentences:  59%|█████▊    | 60237/102816 [00:26<00:19, 2185.04sent/s]

Processed 60000/102816 sentences...


Processing Sentences:  69%|██████▊   | 70444/102816 [00:31<00:13, 2466.99sent/s]

Processed 70000/102816 sentences...


Processing Sentences:  78%|███████▊  | 80437/102816 [00:35<00:10, 2183.76sent/s]

Processed 80000/102816 sentences...


Processing Sentences:  88%|████████▊ | 90372/102816 [00:40<00:05, 2119.56sent/s]

Processed 90000/102816 sentences...


Processing Sentences:  98%|█████████▊| 100290/102816 [00:45<00:01, 2114.29sent/s]

Processed 100000/102816 sentences...


Processing Sentences: 100%|██████████| 102816/102816 [00:46<00:00, 2201.09sent/s]

✅ Cleaning complete! File saved as cleaned_data.txt



