***Importing Libraries***

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string



***Reading & Storing Data in Chunks***

In [None]:
def read_csv_and_store_chunks(input_file, chunk_size=5000000):
    num_chunks=0
    chunks = []
    try:
        # Iterate over the CSV file in chunks
        for i, chunk in enumerate(pd.read_csv(input_file, chunksize=chunk_size,encoding='latin-1')):
            # Process each chunk as needed
            chunk.dropna(inplace=True)  # Drop rows with missing values
            chunk['SECTION_TEXT'] = chunk['SECTION_TEXT'].astype(str).apply(preprocess_text)
            chunks.append(chunk)  # Append the processed chunk to the list
            print(f"Processed chunk {i+1}")
    except Exception as e:
        print(f"Error reading CSV: {e}")
    if chunks:
        return pd.concat(chunks, ignore_index=True)  # Concatenate all chunks into a single DataFrame
    else:
        print("No valid data chunks found")
        return None

# Read CSV file in chunks and store its content in a DataFrame
input_data_df = read_csv_and_store_chunks("/kaggle/input/search-engine-asst2/enwiki-20170820.csv", chunk_size=1000)

Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10
Processed chunk 11
Processed chunk 12
Processed chunk 13
Processed chunk 14
Processed chunk 15
Processed chunk 16
Processed chunk 17
Processed chunk 18
Processed chunk 19
Processed chunk 20
Processed chunk 21
Processed chunk 22
Processed chunk 23
Processed chunk 24
Processed chunk 25
Processed chunk 26
Processed chunk 27
Processed chunk 28
Processed chunk 29
Processed chunk 30
Processed chunk 31
Processed chunk 32
Processed chunk 33
Processed chunk 34
Processed chunk 35
Processed chunk 36
Processed chunk 37
Processed chunk 38
Processed chunk 39
Processed chunk 40
Processed chunk 41
Processed chunk 42
Processed chunk 43
Processed chunk 44
Processed chunk 45
Processed chunk 46
Processed chunk 47
Processed chunk 48
Processed chunk 49
Processed chunk 50
Processed chunk 51
Processed chunk 52
Processed chunk 53
Pr

***Displaying first 100 rows***

In [14]:
if input_data_df is not None:
    # Display the first 100 rows of the DataFrame
    print(input_data_df.head(100))
else:
    print("No valid data DataFrame generated")

    ARTICLE_ID      TITLE                 SECTION_TITLE  \
0            0  Anarchism                  Introduction   
1            0  Anarchism     Etymology and terminology   
2            0  Anarchism                       History   
3            0  Anarchism  Anarchist schools of thought   
4            0  Anarchism   Internal issues and debates   
..         ...        ...                           ...   
95           7  Aristotle                 List of works   
96           7  Aristotle                       Eponyms   
97           7  Aristotle                      See also   
98           7  Aristotle          Notes and references   
99           7  Aristotle               Further reading   

                                         SECTION_TEXT  
0   '' anarch '' polit philosophi advoc self-gover...  
1   term ''anarch '' compound word compos word ''a...  
2   ===origins=== woodcut digger document william ...  
3   portrait philosoph pierre-joseph proudhon 1809...  
4   consist

***Pre-processing***

In [3]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Custom list of stopwords
custom_stopwords = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was",
    "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and",
    "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both",
    "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now"
])

def preprocess_text(text):
    # Lowercasing
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Removing Punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    # Removing Stopwords
    tokens = [token for token in tokens if token not in custom_stopwords]

    # Stemming or Lemmatization (only on unique words)
    stemmer = PorterStemmer()
    unique_words = set(tokens)
    stemmed_words = [stemmer.stem(word) for word in unique_words]

    # Create a mapping of original words to stemmed words
    stemmed_word_map = dict(zip(unique_words, stemmed_words))

    # Replace each token with its stemmed version
    tokens = [stemmed_word_map[token] for token in tokens]

    return ' '.join(tokens)  # Return preprocessed text as a single string


***File Writing***

In [16]:
def write_to_text_file(df, output_file):
    with open(output_file, 'w') as f:
        # Variables to store article_id and appended_paragraph
        current_article_id = None
        appended_paragraph = ""

        # Iterate over DataFrame rows
        for index, row in df.iterrows():
            article_id = row['ARTICLE_ID']
            section_text = row['SECTION_TEXT']

            # Check if the section text is not a heading
            if section_text.strip() and not section_text.strip().startswith('='):
                # If current article ID is different from previous one, write the appended paragraph to file
                if article_id != current_article_id and current_article_id is not None:
                    f.write(f"{current_article_id},{appended_paragraph}\n")  # Write the appended paragraph to file
                    appended_paragraph = ""  # Reset appended paragraph for the new article ID

                # Append section text to the paragraph
                appended_paragraph += section_text.strip() + ' '  # Add space between sections

                # Update current article ID
                current_article_id = article_id

        # Write the last appended paragraph to file
        if appended_paragraph:
            f.write(f"{current_article_id},{appended_paragraph}\n")  # Write the appended paragraph to file

write_to_text_file(input_data_df, "input.txt")
