In [None]:
import os
import re
import zipfile
import pandas as pd
import numpy as np
import textstat
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.utils import resample

# Ensure required NLTK resources are available.
# word_tokenize() relies on the Punkt tokenizer data. Older NLTK releases load it
# from the 'punkt' package, while recent releases (3.8.2+) look for 'punkt_tab',
# so both are requested. A name unknown to the installed NLTK version is reported
# by nltk.download() as a failed download (returns False) rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')

In [47]:
# -------------------- Utility Functions -------------------- #

def compute_readability_scores(text):
    """Compute six readability scores for a given text.

    Parameters
    ----------
    text : object
        The text to score. Anything that is not a non-blank ``str``
        (None, NaN, numbers, lists, whitespace-only strings) is treated as missing.

    Returns
    -------
    list
        Six values, in this order: Flesch Reading Ease, Flesch-Kincaid Grade,
        Gunning Fog, SMOG Index, Dale-Chall score, Automated Readability Index.
        A list of six ``None`` values is returned when the text is missing.
    """
    # The isinstance check already rejects None/NaN/pd.NA. Unlike pd.isna(), it
    # never yields an array for list-like input, which made the previous
    # `pd.isna(text) or ...` chain raise "truth value ... is ambiguous".
    if not isinstance(text, str) or not text.strip():
        return [None] * 6  # Return None values if text is missing
    return [
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.gunning_fog(text),
        textstat.smog_index(text),
        textstat.dale_chall_readability_score(text),
        textstat.automated_readability_index(text)
    ]


def extract_and_clean_zip(zip_folder_path, extract_folder_path):
    """Read every CSV stored inside the ZIP archives of a folder.

    The CSV members are read straight from the archives (nothing is unpacked to
    disk). The first line of each CSV is skipped, so the second line is used as
    the header row.

    Parameters
    ----------
    zip_folder_path : str
        Folder that is scanned (non-recursively) for ``*.zip`` files.
    extract_folder_path : str
        Output folder; it is created if missing so that callers can write into it.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV member, ordered by archive file name and then by the
        member order inside each archive. Empty when no CSV is found.
    """
    os.makedirs(extract_folder_path, exist_ok=True)
    dataframes = []

    # os.listdir() returns entries in arbitrary order. Sorting makes the frame
    # order (and therefore any later keep="first" de-duplication) reproducible.
    for zip_file in sorted(os.listdir(zip_folder_path)):
        if not zip_file.endswith(".zip"):
            continue
        zip_path = os.path.join(zip_folder_path, zip_file)
        with zipfile.ZipFile(zip_path, 'r') as zf:
            for file in zf.namelist():
                if not file.endswith(".csv"):
                    continue
                with zf.open(file) as extracted_file:
                    df = pd.read_csv(extracted_file, skiprows=1)  # Remove the first row
                dataframes.append(df)
    return dataframes


def process_domain(zip_folder, extract_folder, output_name, domain, base_dir, columns_to_keep):
    """Build the cleaned CSV for one domain.

    Steps: read all CSVs from the domain's ZIP folder, concatenate them, drop
    repeated DOIs, drop rows missing Abstract/Title/FCR, keep only the requested
    columns, append readability scores for the abstract, tag the rows with the
    domain name and write the result to ``<base_dir>/<extract_folder>/<output_name>.csv``.

    Parameters
    ----------
    zip_folder : str
        Folder with the ZIP archives, relative to ``base_dir``.
    extract_folder : str
        Output folder, relative to ``base_dir``.
    output_name : str
        File name (without extension) of the CSV that is written.
    domain : str
        Value stored in the added "Domain" column.
    base_dir : str
        Root directory for both folders.
    columns_to_keep : list of str
        Columns to retain; names absent from the data are ignored.

    Returns
    -------
    str or None
        Path of the written CSV, or None when the ZIP folder held no CSV files.
    """
    zip_folder_path = os.path.join(base_dir, zip_folder)
    extract_folder_path = os.path.join(base_dir, extract_folder)
    cleaned_csv_path = os.path.join(extract_folder_path, f"{output_name}.csv")

    dataframes = extract_and_clean_zip(zip_folder_path, extract_folder_path)
    if not dataframes:
        print(f"No CSV files found in {zip_folder}")
        return None

    combined_df = pd.concat(dataframes, ignore_index=True)

    if "DOI" in combined_df.columns:
        # drop_duplicates() treats every missing DOI as the same value, which
        # would collapse all DOI-less records into a single row. Only rows that
        # repeat an actual DOI are removed.
        repeated_doi = combined_df["DOI"].notna() & combined_df.duplicated(subset="DOI", keep="first")
        combined_df = combined_df[~repeated_doi]

    required_columns = ["Abstract", "Title", "FCR"]
    combined_df = combined_df.dropna(
        subset=[col for col in required_columns if col in combined_df.columns]
    ).reset_index(drop=True)

    available_columns = [col for col in columns_to_keep if col in combined_df.columns]
    # .copy() so the "Domain" assignment below never writes into a slice.
    combined_df = combined_df[available_columns].copy()

    if "Abstract" in combined_df.columns:
        readability_scores = combined_df["Abstract"].apply(compute_readability_scores)
        readability_df = pd.DataFrame(
            readability_scores.tolist(),
            index=combined_df.index,  # explicit alignment for the column-wise concat
            columns=[
                "Flesch Reading Ease", "Flesch-Kincaid Grade Level", "Gunning Fog Index",
                "SMOG Index", "Dale-Chall Readability Score", "Automated Readability Index"
            ]
        )
        combined_df = pd.concat([combined_df, readability_df], axis=1)

    combined_df["Domain"] = domain
    combined_df.to_csv(cleaned_csv_path, index=False)
    return cleaned_csv_path


def merge_cleaned_data(cleaned_file_paths, final_output_path):
    """Merge the per-domain CSVs into one dataset and remove repeated DOIs.

    Parameters
    ----------
    cleaned_file_paths : list of str
        Paths of the cleaned per-domain CSV files, in priority order: when the
        same DOI appears in several files, the first occurrence is kept.
    final_output_path : str
        Where the merged CSV is written (without the index).

    Returns
    -------
    pandas.DataFrame
        The merged frame that was written to ``final_output_path``.

    Raises
    ------
    ValueError
        If ``cleaned_file_paths`` is empty.
    """
    if not cleaned_file_paths:
        # pd.concat([]) would raise the same exception type with a less helpful message.
        raise ValueError("merge_cleaned_data received no cleaned CSV files to merge")

    dataframes = [pd.read_csv(file) for file in cleaned_file_paths]
    final_df = pd.concat(dataframes, ignore_index=True)

    if "DOI" in final_df.columns:
        # Missing DOIs compare equal inside drop_duplicates(), so using it directly
        # would keep only one DOI-less record. Only real, repeated DOIs are dropped.
        repeated_doi = final_df["DOI"].notna() & final_df.duplicated(subset="DOI", keep="first")
        final_df = final_df[~repeated_doi]

    final_df.to_csv(final_output_path, index=False)
    return final_df


def ecdf_normalization(values):
    """Map each value to its empirical CDF score within ``values``.

    The score of a value is the fraction of the sample that is less than or
    equal to it, rounded to three decimals.
    """
    ordered = np.sort(values)
    # side='right' places ties after equal elements, i.e. counts "<= value".
    counts_le = np.searchsorted(ordered, values, side='right')
    fractions = counts_le / len(ordered)
    return fractions.round(3)


def add_ecdf_normalization(df):
    """Return a copy of ``df`` with an added 'ECDF_FCR' column.

    'ECDF_FCR' is the ECDF score of 'FCR' computed separately inside every
    ('Domain', 'PubYear') group. The input frame is left untouched, so
    re-running a notebook cell that calls this function is side-effect free.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Domain', 'PubYear' and 'FCR'.

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus 'ECDF_FCR'.
    """
    ecdf_scores = df.groupby(['Domain', 'PubYear'])['FCR'].transform(ecdf_normalization)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(ECDF_FCR=ecdf_scores)


def clean_text(text):
    """Normalise a piece of text.

    Missing values become an empty string. Otherwise the text is lowercased,
    every character other than ASCII letters, digits, whitespace, '.' and ','
    is removed, and runs of whitespace are collapsed to single spaces.
    """
    if pd.isna(text):
        return ""
    lowered = text.lower()
    without_specials = re.sub(r"[^a-zA-Z0-9\s.,]", "", lowered)
    return re.sub(r"\s+", " ", without_specials).strip()


def filter_short_abstracts(df, min_tokens=100):
    """Keep only the rows whose abstract has at least ``min_tokens`` tokens.

    Tokens are counted with NLTK's ``word_tokenize``; a non-string abstract
    counts as zero tokens. The input frame is not modified (the token counts
    are held in a temporary Series instead of a helper column).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Abstract' column.
    min_tokens : int, default 100
        Minimum number of tokens an abstract needs to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and only the qualifying rows.
    """
    token_counts = df["Abstract"].apply(
        lambda x: len(word_tokenize(x)) if isinstance(x, str) else 0
    )
    # .copy() hands back an independent frame that is safe to assign columns to.
    return df[token_counts >= min_tokens].copy()

def remove_empty_titles_abstracts(df, min_title_tokens=4):
    """Remove rows with a missing/blank title or abstract, or with a too-short title.

    A row is dropped when 'Title' or 'Abstract' is NaN, when 'Abstract' is an
    empty or whitespace-only string, or when 'Title' has fewer than
    ``min_title_tokens`` tokens (NLTK ``word_tokenize``; a non-string title
    counts as zero tokens). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Title' and 'Abstract' columns.
    min_title_tokens : int, default 4
        Minimum number of tokens a title needs to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and only the qualifying rows.
    """
    present = df.dropna(subset=["Title", "Abstract"])
    title_tokens = present["Title"].apply(
        lambda x: len(word_tokenize(x)) if isinstance(x, str) else 0
    )
    # dropna() does not catch "" (e.g. a cleaned NaN), so blank abstracts are
    # filtered explicitly to honour the "empty" part of the contract.
    has_abstract = present["Abstract"].astype(str).str.strip().ne("")
    return present[(title_tokens >= min_title_tokens) & has_abstract].copy()



def preprocess_dataset(df):
    """Return a copy of ``df`` whose 'Title' and 'Abstract' are text-cleaned.

    Both columns are passed through ``clean_text``; nothing else is computed or
    filtered here. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Title' and 'Abstract' columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and rows, with cleaned text columns.
    """
    return df.assign(
        Title=df['Title'].apply(clean_text),
        Abstract=df['Abstract'].apply(clean_text),
    )


def categorize_fcr(df):
    """Return a copy of ``df`` with an added 'ECDF_FCR_Category' column.

    'ECDF_FCR' is binned into three categories:
    Low = [0, 0.33], Medium = (0.33, 0.67], High = (0.67, 1].
    Values outside [0, 1] and missing values get a missing category.
    The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'ECDF_FCR' column.

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus the categorical
        'ECDF_FCR_Category' column.
    """
    bins = [0, 0.33, 0.67, 1]
    labels = ["Low", "Medium", "High"]
    # include_lowest=True closes the first interval on the left so 0 maps to "Low".
    categories = pd.cut(df["ECDF_FCR"], bins=bins, labels=labels, include_lowest=True)
    return df.assign(ECDF_FCR_Category=categories)


def balance_dataset(df, random_state=42):
    """Undersample every 'ECDF_FCR_Category' group to the size of the smallest one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'ECDF_FCR_Category' column. Rows whose category is
        missing are not part of any group and are therefore dropped.
    random_state : int or None, default 42
        Seed for the row sampling, so repeated runs select the same rows.
        Pass None for a different random draw on every call.

    Returns
    -------
    pandas.DataFrame
        Balanced frame with the original columns and a fresh 0..n-1 index,
        ordered by category.
    """
    category_counts = df["ECDF_FCR_Category"].value_counts()

    # Print category counts
    print("Category Distribution Before Balanced:\n", category_counts)

    # value_counts() on a categorical column also lists categories with no rows.
    # Letting such a zero drive the minimum would empty the whole dataset, so
    # only categories that actually occur are considered.
    populated_counts = category_counts[category_counts > 0]
    if populated_counts.empty:
        df_balanced = df.iloc[0:0].reset_index(drop=True)
    else:
        min_count = int(populated_counts.min())
        # GroupBy.sample keeps every column (including the grouping column) and
        # accepts a seed; observed=True skips categories that have no rows.
        df_balanced = (
            df.groupby("ECDF_FCR_Category", observed=True)
            .sample(n=min_count, random_state=random_state)
            .reset_index(drop=True)
        )

    # Verify new distribution
    print("Balanced Category Distribution:\n", df_balanced["ECDF_FCR_Category"].value_counts())
    return df_balanced

In [3]:
# -------------------- Main Execution -------------------- #

base_dir = "../Dataset Creation"
final_combined_csv_path = os.path.join(base_dir, "2018-2022 Data.csv")

# Parallel lists: position i of every list describes the same domain
# (ZIP input folder, output folder, output file name, domain label).
zip_folders = ["zip folders/CS_zip", "zip folders/Eng_zip", "zip folders/Math_zip", "zip folders/Psych_zip", "zip folders/Physical Sci_zip"]
extract_folders = ["computer science", "engineering", "math", "psychology", "physical science"]
combined_output = ["computer science data", "engineering data", "math data", "psychology data", "physical science data"]
domains = ["Computer Science", "Engineering", "Mathematics", "Psychology", "Physical Science"]

columns_to_keep = ["Publication ID", "DOI", "Title", "Abstract", "FCR", "PubYear"]
cleaned_file_paths = []

# Build one cleaned CSV per domain; domains without any CSV input return None
# and are left out of the merge.
for zip_folder, extract_folder, output_name, domain in zip(zip_folders, extract_folders, combined_output, domains):
    cleaned_path = process_domain(zip_folder, extract_folder, output_name, domain, base_dir, columns_to_keep)
    if cleaned_path:
        cleaned_file_paths.append(cleaned_path)

# merge_cleaned_data() already writes the merged frame to final_combined_csv_path,
# so a second to_csv() call on the same path is not needed.
final_df1 = merge_cleaned_data(cleaned_file_paths, final_combined_csv_path)

In [4]:
# Row count after merging all domain files.
len(final_df1)

45231

In [5]:
# Add the ECDF_FCR column (FCR scored within each Domain/PubYear group) and
# overwrite the combined CSV with the result.
final_df2 = add_ecdf_normalization(final_df1)
final_df2.to_csv(final_combined_csv_path, index=False)

In [6]:
# Row count after adding ECDF_FCR (no rows are removed by that step).
len(final_df2)

45231

In [7]:
# Drop rows whose abstract has fewer than 100 tokens and overwrite the combined CSV.
final_df3 = filter_short_abstracts(final_df2)
final_df3.to_csv(final_combined_csv_path, index=False)

In [8]:
# Row count after removing short abstracts.
len(final_df3)

41772

In [9]:
# Bin ECDF_FCR into Low / Medium / High and overwrite the combined CSV.
final_df4 = categorize_fcr(final_df3)
final_df4.to_csv(final_combined_csv_path, index=False)

In [10]:
# Row count after categorisation (no rows are removed by that step).
len(final_df4)

41772

In [None]:
# Undersample every FCR category to the size of the smallest one and
# overwrite the combined CSV.
final_df5 = balance_dataset(final_df4)
final_df5.to_csv(final_combined_csv_path, index=False) 

In [28]:
# Row count after the first balancing pass.
len(final_df5)

38634

In [29]:
# Clean the Title and Abstract text (lowercase, strip special characters)
# and overwrite the combined CSV.
final_df6 = preprocess_dataset(final_df5)
final_df6.to_csv(final_combined_csv_path, index=False)

In [30]:
# Row count after text cleaning (no rows are removed by that step).
len(final_df6)

38634

In [None]:
# Drop rows with a missing title/abstract or a title shorter than 4 tokens
# and overwrite the combined CSV.
final_df7 = remove_empty_titles_abstracts(final_df6)
final_df7.to_csv(final_combined_csv_path, index=False)

In [32]:
# Row count after the title/abstract filter.
len(final_df7)

38034

In [None]:
# Balance the categories again, since the title/abstract filter removed rows
# after the first balancing pass; overwrite the combined CSV with the final data.
final_df8 = balance_dataset(final_df7)
final_df8.to_csv(final_combined_csv_path, index=False)

In [49]:
# Row count of the final balanced dataset.
len(final_df8)

37827