In [1]:
import pandas as pd

# Input locations, relative to the notebook's working directory.
my_scrap_path = '../final_article.csv'  # article records to be converted (presumably scraped -- confirm)
multi_pop_path = '../multiPoP.csv'  # reference file whose column layout is the target format

# Read both CSV files into DataFrames.
my_scrap_df, multi_pop_df = (
    pd.read_csv(csv_path) for csv_path in (my_scrap_path, multi_pop_path)
)

# Columns of the output file, in the exact order they are written.
output_columns = [
    # citation count and bibliographic core
    'Cites', 'Authors', 'Title', 'Year', 'Source', 'Publisher',
    # links, rank and query metadata
    'ArticleURL', 'CitesURL', 'GSRank', 'QueryDate',
    # identifiers
    'Type', 'DOI', 'ISSN', 'CitationURL',
    # volume / issue / page details
    'Volume', 'Issue', 'StartPage', 'EndPage',
    # derived citation metrics
    'ECC', 'CitesPerYear', 'CitesPerAuthor', 'AuthorCount', 'Age',
    # abstract and additional links
    'Abstract', 'FullTextURL', 'RelatedURL',
]

# Define a function to transform MyScrap.csv to match multiPoP.csv format
def transform_my_scrap_to_multipop(my_scrap_df, output_columns):
    """Reshape an articles DataFrame into the multiPoP.csv column layout.

    Known source columns are renamed to their output names, ``Year`` is
    converted to float, every requested column that is still absent is added
    as a column of empty strings, and the result is restricted to
    ``output_columns`` in that order.

    Parameters
    ----------
    my_scrap_df : pandas.DataFrame
        Input records. Columns named ``title``, ``year``, ``cite``,
        ``main_authors``, ``link_ids_x`` and ``main_subject`` are renamed;
        any of them may be absent.
    output_columns : list of str
        Column names (and order) of the returned frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly ``output_columns``. The input frame is not
        modified.

    Notes
    -----
    Year values that cannot be parsed as numbers become NaN instead of
    raising. If the input has no year column at all, ``Year`` is filled with
    empty strings like any other missing column.
    """
    # Source column name -> output column name.
    column_mapping = {
        'title': 'Title',
        'year': 'Year',
        'cite': 'Cites',
        'main_authors': 'Authors',
        'link_ids_x': 'ArticleURL',  # Assuming link_ids_x provides article links
        'main_subject': 'Source'     # Assuming main_subject is analogous to Source
    }

    # rename() returns a new frame, so the caller's DataFrame stays untouched.
    transformed_df = my_scrap_df.rename(columns=column_mapping)

    # Set Year to float as seen in multiPoP.csv. The conversion is guarded so
    # that an input without a year column does not raise KeyError, and
    # non-numeric values are coerced to NaN rather than aborting the whole
    # transformation.
    if 'Year' in transformed_df.columns:
        transformed_df['Year'] = pd.to_numeric(
            transformed_df['Year'], errors='coerce'
        ).astype(float)

    # Add requested columns that the input does not provide, walking
    # output_columns in order so the result is deterministic.
    for col in output_columns:
        if col not in transformed_df.columns:
            transformed_df[col] = ""  # Fill missing columns with empty strings

    # Select and reorder columns to match output_columns.
    return transformed_df[list(output_columns)]

# Convert the loaded article records into the output_columns layout.
transformed_df = transform_my_scrap_to_multipop(
    my_scrap_df=my_scrap_df,
    output_columns=output_columns,
)

In [2]:
# Write the converted records to a CSV file in the working directory,
# leaving out the DataFrame index.
transformed_df.to_csv(
    path_or_buf='Transformed_pop_articles.csv',
    index=False,
)