In [22]:
!pip install rank-bm25 faiss-cpu sentence-transformers transformers arxiv spacy nltk
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
      --------------------------------------- 0.2/12.8 MB 1.2 MB/s eta 0:00:11
      --------------------------------------- 0.3/12.8 MB 1.6 MB/s eta 0:00:09
     - -------------------------------------- 0.4/12.8 MB 2.0 MB/s eta 0:00:07
     -- ------------------------------------- 0.7/12.8 MB 2.6 MB/s eta 0:00:05
     --- ------------------------------------ 1.0/12.8 MB 2.9 MB/s eta 0:00:05
     --- ------------------------------------ 1.2/12.8 MB 3.2 MB/s eta 0:00:04
     ---- ----------------------------------- 1.4/12.8 MB 3.4 MB/s eta 0:00:04
     ----- -----------------------------

In [23]:
import pandas as pd
import numpy as np
import re
import nltk
import arxiv
import time
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [24]:
nltk.download('punkt', force=True)
nltk.download('wordnet', force=True)
nltk.download('stopwords', force=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dubey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dubey\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dubey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
def fetch_arxiv_cs_papers(total_papers=1000, batch_size=100):
    client = arxiv.Client()
    all_papers = []
    
    for start in tqdm(range(0, total_papers, batch_size)):
        search = arxiv.Search(
            query="cat:cs.*",
            max_results=batch_size,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )
        
        try:
            results = client.results(search)
            batch = []
            for paper in results:
                batch.append({
                    "id": paper.entry_id.split('/')[-1],
                    "title": paper.title,
                    "authors": [a.name for a in paper.authors],
                    "abstract": paper.summary.replace('\n', ' '),
                    "published": paper.published.date(),
                    "categories": paper.categories,
                    "pdf_url": paper.pdf_url
                })
            all_papers.extend(batch)
            time.sleep(1.5)
        except Exception as e:
            print(f"Error at start={start}: {str(e)}")
            continue
    
    return pd.DataFrame(all_papers)


In [26]:
def preprocess_data(df):
    # Make sure required resources are downloaded
    nltk.download('punkt', force=True)
    nltk.download('wordnet', force=True)
    nltk.download('stopwords', force=True)
    
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        tokens = nltk.word_tokenize(text)
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(tokens)

    df['processed_abstract'] = df['abstract'].apply(preprocess_text)
    return df

In [27]:
# Execute data pipeline
if __name__ == '__main__':
    # Fetch and save raw data
    raw_df = fetch_arxiv_cs_papers(total_papers=1000)
    raw_df.to_csv('arxiv_cs_raw.csv', index=False)
    
    # Preprocess and save cleaned data
    cleaned_df = preprocess_data(raw_df)
    cleaned_df.to_csv('arxiv_cs_processed.csv', index=False)
    print("Data processing completed!")

100%|██████████| 10/10 [00:49<00:00,  4.91s/it]
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dubey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dubey\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dubey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Data processing completed!
