***Importing Libraries***

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

***Reading & Storing Data in Chunks***

In [None]:
def read_csv_and_store_chunks(input_file, chunk_size=5000):
    """Read a CSV file chunk by chunk, clean each chunk, and combine them.

    Every chunk has rows with missing values dropped and its
    ``SECTION_TEXT`` column converted to ``str`` and passed through
    ``preprocess_text``.

    Args:
        input_file: Path (or file-like object) handed to ``pd.read_csv``.
        chunk_size: Number of rows per chunk.

    Returns:
        A single DataFrame with all processed chunks concatenated and
        re-indexed, or ``None`` when no chunk could be processed. If an
        error occurs part-way through, the chunks processed so far are
        still returned.
    """
    chunks = []
    try:
        # ``on_bad_lines='skip'`` replaces the deprecated ``error_bad_lines=False``
        # (removed in pandas 2.0, where it raises TypeError). The ``with`` block
        # closes the underlying file handle even if a chunk fails.
        with pd.read_csv(
            input_file,
            chunksize=chunk_size,
            on_bad_lines='skip',
            encoding='latin-1',
        ) as reader:
            for i, chunk in enumerate(reader):
                chunk.dropna(inplace=True)  # Drop rows with missing values
                chunk['SECTION_TEXT'] = chunk['SECTION_TEXT'].astype(str).apply(preprocess_text)
                chunks.append(chunk)
                print(f"Processed chunk {i+1}")
    except Exception as e:
        # Deliberately best-effort: report the problem and keep whatever
        # chunks were processed before the failure.
        print(f"Error reading CSV: {e}")
    if chunks:
        return pd.concat(chunks, ignore_index=True)  # One DataFrame for all chunks
    else:
        print("No valid data chunks found")
        return None

# Load the CSV in 1000-row chunks (overriding the function's default of 5000).
# The result is one combined DataFrame, or None when no chunk could be processed.
input_data_df = read_csv_and_store_chunks("/content/enwiki-20170820.csv", chunk_size=1000)



  for i, chunk in enumerate(pd.read_csv(input_file, chunksize=chunk_size, error_bad_lines=False,encoding='latin-1')):


Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10
Processed chunk 11
Processed chunk 12
Processed chunk 13
Processed chunk 14
Processed chunk 15
Processed chunk 16
Processed chunk 17
Processed chunk 18
Processed chunk 19
Processed chunk 20
Processed chunk 21
Processed chunk 22
Processed chunk 23
Processed chunk 24
Processed chunk 25
Processed chunk 26
Processed chunk 27
Processed chunk 28
Processed chunk 29
Processed chunk 30
Processed chunk 31
Processed chunk 32
Processed chunk 33
Processed chunk 34
Processed chunk 35
Processed chunk 36
Processed chunk 37
Processed chunk 38
Processed chunk 39
Processed chunk 40
Processed chunk 41
Processed chunk 42
Processed chunk 43
Processed chunk 44
Processed chunk 45
Processed chunk 46
Processed chunk 47
Processed chunk 48
Processed chunk 49
Processed chunk 50
Processed chunk 51
Processed chunk 52
Processed chunk 53
Pr

***Displaying first 100 rows***

In [None]:
# Report a failed load; otherwise preview the first 100 rows of the DataFrame.
if input_data_df is None:
    print("No valid data DataFrame generated")
else:
    print(input_data_df.head(100))

   ARTICLE_ID                 TITLE  \
0           0             Anarchism   
1           0             Anarchism   
2           0             Anarchism   
3          21  Agricultural science   
4          21  Agricultural science   
..        ...                   ...   
95         60          Alkali metal   
96         61              Alphabet   
97         61              Alphabet   
98         61              Alphabet   
99         61              Alphabet   

                                    SECTION_TITLE  \
0                                    Introduction   
1                       Etymology and terminology   
2                                         History   
3                                    Introduction   
4   Agriculture agricultural science and agronomy   
..                                            ...   
95                                     References   
96                                   Introduction   
97                                      Etymology   
9

***Pre-processing***

In [None]:
# Individual punctuation characters, for O(1) per-character membership tests.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Lazily populated by _english_stop_words(); None until first use.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stopwords as a set, loading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize a piece of text for downstream processing.

    Steps: lowercase, tokenize with ``word_tokenize``, drop punctuation
    tokens and English stopwords, then Porter-stem the remaining tokens.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces, in their original order.
    """
    tokens = word_tokenize(text.lower())

    stop_words = _english_stop_words()
    # A token is punctuation when *every* character is a punctuation mark.
    # The previous ``token not in string.punctuation`` was a substring test,
    # so multi-character tokens such as "``", "''", "..." and "--" slipped
    # through.
    tokens = [
        token
        for token in tokens
        if not all(ch in _PUNCTUATION_CHARS for ch in token)
        and token not in stop_words
    ]

    # Stem each distinct word once, then map every token to its stem.
    stemmer = PorterStemmer()
    stem_of = {word: stemmer.stem(word) for word in set(tokens)}

    return ' '.join(stem_of[token] for token in tokens)


***File Writing***

In [None]:
def _format_article_record(article_id, sections):
    """Build one output record: ``<article_id>,`` followed by one section per line."""
    return f"{article_id}," + ''.join(section + '\n' for section in sections)


def write_to_text_file(df, output_file):
    """Write one text record per run of consecutive rows sharing an ARTICLE_ID.

    Rows whose ``SECTION_TEXT`` is blank or starts with ``=`` (after
    stripping whitespace) are skipped. The remaining section texts of an
    article are stripped and emitted one per line after ``<ARTICLE_ID>,``.
    Records are separated by an extra newline. Nothing is written when no
    row qualifies.

    Args:
        df: DataFrame with ``ARTICLE_ID`` and ``SECTION_TEXT`` columns;
            ``SECTION_TEXT`` values must be strings.
        output_file: Path of the text file to create or overwrite.
    """
    # Explicit UTF-8: the source CSV is decoded as latin-1 in this file, so
    # the text may contain non-ASCII characters that a platform-default
    # encoding could reject.
    with open(output_file, 'w', encoding='utf-8') as f:
        separator = ''  # becomes '\n' once the first record has been written
        current_article_id = None
        sections = []  # collected in a list to avoid quadratic string +=

        for article_id, section_text in zip(df['ARTICLE_ID'], df['SECTION_TEXT']):
            stripped = section_text.strip()
            # Skip blank sections and headings.
            if not stripped or stripped.startswith('='):
                continue

            # A new article starts: flush the record collected so far.
            if current_article_id is not None and article_id != current_article_id:
                f.write(separator + _format_article_record(current_article_id, sections))
                separator = '\n'
                sections = []

            sections.append(stripped)
            current_article_id = article_id

        # Flush the final article, if any section was collected.
        if sections:
            f.write(separator + _format_article_record(current_article_id, sections))

# Example usage: write the article file only when loading produced a DataFrame.
# read_csv_and_store_chunks returns None when no chunk could be processed, and
# calling write_to_text_file with None would fail on the first column access.
if input_data_df is not None:
    write_to_text_file(input_data_df, "input.txt")
else:
    print("No valid data DataFrame generated")
