In [15]:
import os
import pandas as pd

folder_path = '.'  # Use current directory
script_name = 'preprocessing.ipynb'  # Name of the script to exclude

merged_data = pd.DataFrame()

for filename in os.listdir(folder_path):
    if (
        filename.startswith('articles_data') and
        'articles_links' not in filename and
        filename != script_name
    ):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        merged_data = pd.concat([merged_data, df], ignore_index=True)

merged_data.to_csv(os.path.join(folder_path, 'merged_articles_data.csv'), index=False) 

In [16]:
df = pd.read_csv('merged_articles_data.csv')
df.shape


  df = pd.read_csv('merged_articles_data.csv')


(2334762, 5)

In [17]:
df.head()

Unnamed: 0,url,title,abstract,introduction,conclusion
0,https://link.springer.com/article/10.1007/BF01...,Einladung,,,
1,https://link.springer.com/article/10.1007/BF01...,Bemerkungen zu E. A. Feoktistow's “Einige Wort...,,,
2,https://link.springer.com/article/10.1007/BF01...,Entgegnung,,,
3,https://link.springer.com/article/10.1007/BF01...,Zei weitere Fälle von Kaiserschnitt (Nr. 5 und...,,,
4,https://link.springer.com/article/10.1007/BF01...,Monsieur le professeur!,,,


In [18]:
df = df.drop(columns=['introduction', 'conclusion'])


In [19]:
df = df.drop_duplicates(subset='url')

In [20]:
df.shape

(2197457, 3)

In [21]:
df = df.dropna(subset=['abstract'])
df.shape

(1668188, 3)

In [22]:
df.head()

Unnamed: 0,url,title,abstract
881,https://link.springer.com/article/10.1007/s007...,Novel carbon material with potential applicati...,AbstractLead-acid batteries (LABs) are one of ...
882,https://link.springer.com/article/10.1007/s007...,Specific conductivities of tetraalkylammonium ...,AbstractSearching for environmentally friendly...
883,https://link.springer.com/article/10.1007/s007...,Development of electrochemical sensor for quan...,AbstractThis study presents the development of...
884,https://link.springer.com/article/10.1007/s007...,Synthesis of thioamides from Schiff bases and ...,AbstractThioamides play an important role in p...
886,https://link.springer.com/article/10.1007/s007...,Mechanistic study and computational analysis o...,AbstractThe mechanism of electrochemical oxida...


In [23]:
# Remove rows with missing values in the 'title' column
df = df.dropna(subset=['title'])

# Remove leading and trailing whitespace from 'title' and 'abstract' columns
df['title'] = df['title'].str.strip()
df['abstract'] = df['abstract'].str.strip()

# Display the first few rows of the preprocessed dataframe
df.shape

(1668187, 3)

In [24]:
import re
from bs4 import BeautifulSoup

# Function to clean text
def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

# Apply the clean_text function to 'title' and 'abstract' columns
df['title'] = df['title'].apply(clean_text)
df['abstract'] = df['abstract'].apply(clean_text)

# Display the first few rows of the cleaned dataframe
df.head()

  text = BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,url,title,abstract
881,https://link.springer.com/article/10.1007/s007...,Novel carbon material with potential applicati...,AbstractLead-acid batteries (LABs) are one of ...
882,https://link.springer.com/article/10.1007/s007...,Specific conductivities of tetraalkylammonium ...,AbstractSearching for environmentally friendly...
883,https://link.springer.com/article/10.1007/s007...,Development of electrochemical sensor for quan...,AbstractThis study presents the development of...
884,https://link.springer.com/article/10.1007/s007...,Synthesis of thioamides from Schiff bases and ...,AbstractThioamides play an important role in p...
886,https://link.springer.com/article/10.1007/s007...,Mechanistic study and computational analysis o...,AbstractThe mechanism of electrochemical oxida...


In [25]:
df.to_csv('preprocessed_data.csv', index=False)