In [5]:
import os
import pandas as pd
import ast

In [6]:
# Folder scanned for the scraped CSV exports; passed to process_csv_data in a later cell.
csv_directory = './csv_azure/' 

In [7]:
# Combine all CSV files
def combine_csv_files(directory):
    """Read every ``nexus_mods_mods*.csv`` file in ``directory`` into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        A DataFrame holding the rows of all matching files stacked under a
        fresh 0..n-1 index, or an empty DataFrame when no file matches.
    """
    frames = [
        pd.read_csv(os.path.join(directory, file))
        for file in sorted(os.listdir(directory))
        if file.startswith('nexus_mods_mods') and file.endswith('.csv')
    ]
    if not frames:
        # pd.concat raises ValueError on an empty list; preserve the
        # "nothing matched -> empty frame" result instead.
        return pd.DataFrame()
    # Concatenate once: growing a frame inside the loop re-copies every
    # previously accumulated row on each iteration.
    return pd.concat(frames, ignore_index=True)

# Extract nested data 
def extract_nested_columns(df, column_name):
    """Expand a column of dict-like values into flat, prefixed columns.

    String cells are parsed with ``ast.literal_eval``; the resulting dicts are
    flattened with ``pd.json_normalize`` and each new column is named
    ``"<column_name>_<key>"``. The original column is dropped from the result.

    Args:
        df: Source DataFrame. It is not modified.
        column_name: Name of the column holding dicts or their string repr.

    Returns:
        A new DataFrame with ``column_name`` replaced by the flattened
        columns and the same row labels as ``df``. If ``column_name`` is not
        a column of ``df``, ``df`` itself is returned unchanged.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when a string cell
            is not a valid Python literal.
    """
    if column_name not in df.columns:
        return df

    # Parse into a separate Series so the caller's frame keeps its raw strings.
    parsed = df[column_name].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    # Cells that are not dicts after parsing (e.g. missing values) contribute
    # an empty record, i.e. a row of NaN in the flattened columns.
    records = [value if isinstance(value, dict) else {} for value in parsed]
    nested_df = pd.json_normalize(records)
    nested_df.columns = [f"{column_name}_{col}" for col in nested_df.columns]  # Prefix with original column name
    # The flattened frame is numbered 0..n-1 while df may carry gapped labels
    # (for example after drop_duplicates). concat(axis=1) aligns on labels, so
    # reuse df's labels to keep each row paired with its own nested data.
    nested_df.index = df.index
    return pd.concat([df.drop(columns=[column_name]), nested_df], axis=1)

def process_csv_data(directory):
    """Combine, de-duplicate and flatten the mod CSV exports in ``directory``.

    Steps: load all matching CSV files via ``combine_csv_files``, keep the
    first row for each ``(mod_id, game_id)`` pair, then expand every column
    listed in ``nested_columns`` via ``extract_nested_columns``. The frame
    shape is printed after the first two steps.

    Args:
        directory: Folder containing the CSV files.

    Returns:
        The cleaned DataFrame with a contiguous 0..n-1 index.

    Raises:
        KeyError: If the combined data has no ``mod_id`` / ``game_id``
            columns (which is also the case when no CSV file matched).
    """
    # Combine all matching CSV files
    combined_df = combine_csv_files(directory)
    print(f"Combined Data Shape: {combined_df.shape}")

    # Remove duplicates based on `mod_id` and `game_id`.
    # ignore_index=True renumbers the surviving rows 0..n-1; the nested-column
    # expansion below builds a 0..n-1 numbered frame and joins it column-wise,
    # so a gapped index here would pair rows with the wrong nested data.
    combined_df = combined_df.drop_duplicates(
        subset=['mod_id', 'game_id'], ignore_index=True
    )
    print(f"After Removing Duplicates: {combined_df.shape}")

    # Check and process columns with nested data
    nested_columns = ['member_info']  # Update this list based on your column names with nested data
    for col in nested_columns:
        if col in combined_df.columns:
            combined_df = extract_nested_columns(combined_df, col)

    return combined_df

In [8]:
# Build the cleaned mods table from every matching CSV under csv_directory.
cleaned_data = process_csv_data(csv_directory)

# Persist the result as Parquet (Feather would be an alternative for even
# faster access); the DataFrame index is not written to the file.
output_parquet_path = './csv_azure/combined_cleaned.parquet'  # Update to your desired output path
cleaned_data.to_parquet(path=output_parquet_path, index=False)
print(f"Cleaned data saved to {output_parquet_path}")


Combined Data Shape: (34612, 27)
After Removing Duplicates: (31074, 27)
Cleaned data saved to ./csv_azure/combined_cleaned.parquet


In [9]:
import pandas as pd

# Reload the Parquet file written by the previous cell.
df = pd.read_parquet(output_parquet_path)

# Print, in order: the first rows, the column labels, then the dimensions.
for summary in (df.head(), df.columns):
    print(summary)
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")


                                                name  \
0             The Cracking City - Ayleid Player Base   
1                                 Tamriel y mas alla   
2  Unofficial Skyrim Special Edition Patch - Germ...   
3                                     Blessing's Bay   
4                                       Miriam's Way   

                                             summary  \
0  A completely functional player base added, ent...   
1  Una aventura completamente nueva que abarca to...   
2  Die deutsche Übersetzung des Unofficial Skyrim...   
3  A lighthouse player home, unfurnished, just no...   
4                  A Solitude farmhouse unfurnished.   

                                         description  \
0  The resources used here are from [url=http://w...   
1     HOY SE SUBIO UN ARCHIVO DE MUESTRA, LA BASE...   
2  [center][/center]\n<br />\n<br />[color=#a4c2f...   
3  [left][/left][center]\r<br />[font=Comic Sans ...   
4   \r<br />[center][i][font=Comic Sans MS][co

In [12]:
import pandas as pd
import pyodbc



In [21]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Read the Parquet file and rename the `user` column to `user_info`
# before upload.
parquet_file = './csv_azure/combined_cleaned.parquet'
data = pd.read_parquet(parquet_file).rename(columns={"user": "user_info"})

# Connect to SQL Server.
# Credentials are read from the environment instead of being embedded in the
# notebook; a missing variable raises KeyError before any connection attempt.
# NOTE(review): a password was previously committed in this cell - rotate it.
db_user = os.environ["NEXUS_DB_USER"]
db_password = os.environ["NEXUS_DB_PASSWORD"]
engine = create_engine(
    # quote_plus escapes characters such as '@' or '/' that would otherwise
    # be parsed as URL delimiters.
    f"mssql+pyodbc://{quote_plus(db_user)}:{quote_plus(db_password)}"
    "@nmntserver.database.windows.net/NexusModsDB"
    "?driver=ODBC+Driver+17+for+SQL+Server"
)

# Upload the data in bulk.
# if_exists='append' inserts the rows again on every run of this cell, so
# re-running it duplicates the table contents.
data.to_sql('Mods', engine, if_exists='append', index=False, chunksize=10000)
print("Data uploaded successfully!")



Data uploaded successfully!


In [17]:
# Print the largest value in each listed numeric column of `data`, to spot
# excessively large values (presumably ones too big for a 32-bit integer
# column in the target table - confirm against the schema).
columns_to_check = ['uid', 'mod_downloads', 'mod_unique_downloads', 'created_timestamp', 'updated_timestamp']
for col in columns_to_check:
    max_value = data[col].max()
    print(f"Max value in {col}: {max_value}")


Max value in uid: 7318624302218
Max value in mod_downloads: 20657302
Max value in mod_unique_downloads: 5320814
Max value in created_timestamp: 1735489981
Max value in updated_timestamp: 1737723578


In [26]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Load the games table from CSV.
csv_file = 'nexus_mods_games.csv'  # Replace with the path to your CSV file
data = pd.read_csv(csv_file)

# Connect to SQL Server.
# Credentials are read from the environment instead of being embedded in the
# notebook; a missing variable raises KeyError before any connection attempt.
# NOTE(review): a password was previously committed in this cell - rotate it.
db_user = os.environ["NEXUS_DB_USER"]
db_password = os.environ["NEXUS_DB_PASSWORD"]
engine = create_engine(
    # quote_plus escapes characters such as '@' or '/' that would otherwise
    # be parsed as URL delimiters.
    f"mssql+pyodbc://{quote_plus(db_user)}:{quote_plus(db_password)}"
    "@nmntserver.database.windows.net/NexusModsDB"
    "?driver=ODBC+Driver+17+for+SQL+Server"
)

# Upload the data in bulk.
# if_exists='append' inserts the rows again on every run of this cell, so
# re-running it duplicates the table contents.
data.to_sql('Games', engine, if_exists='append', index=False, chunksize=10000)
print("Data uploaded successfully!")

Data uploaded successfully!


In [23]:
# Show the column labels of the frame currently bound to `data`.
print(data.columns)

Index(['id', 'name', 'forum_url', 'nexusmods_url', 'genre', 'file_count',
       'downloads', 'domain_name', 'approved_date', 'file_views', 'authors',
       'file_endorsements', 'mods', 'categories'],
      dtype='object')


In [25]:
# Print the largest value found in each listed numeric column of `data`.
columns_to_check = ['id', 'file_count', 'downloads', 'approved_date', 'file_views']
for col in columns_to_check:
    print(f"Max value in {col}: {data[col].max()}")


Max value in id: 3474
Max value in file_count: 566525
Max value in downloads: 4294967294
Max value in approved_date: 1607433331
Max value in file_views: 5692770596
