<a href="https://colab.research.google.com/github/thatswhatmeetcoded/Sentiment-Classification/blob/main/decision_tree/1_data_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

def gdrive_to_direct_link(url):
    """Convert a Google Drive share link into a direct-download link.

    Two link shapes are recognised:

    * ``.../file/d/<id>/view?usp=...`` (also without the trailing ``/view``)
    * ``.../open?id=<id>`` or ``.../uc?id=<id>``

    Args:
        url: Google Drive share URL.

    Returns:
        A URL of the form ``https://drive.google.com/uc?id=<id>``.

    Raises:
        ValueError: If no file id can be found in ``url``.
    """
    from urllib.parse import parse_qs, urlsplit

    parts = urlsplit(url)

    # Look for the id in the path only, so a query string that directly
    # follows the id (".../d/<id>?usp=sharing") can never leak into it.
    _, marker, tail = parts.path.partition('/d/')
    if marker:
        file_id = tail.split('/', 1)[0]
    else:
        # Fall back to links that carry the id as a query parameter.
        file_id = parse_qs(parts.query).get('id', [''])[0]

    if not file_id:
        raise ValueError(
            f"Could not find a Google Drive file id in URL: {url!r}"
        )
    return f"https://drive.google.com/uc?id={file_id}"

# Share links for each dataset split, keyed by the label that is later
# written into the 'source' column of the merged frame.
file_map = {
    "train":
        "https://drive.google.com/file/d/1LCZh35uWEZPArfk6RLDZL5_BiehIsq_1/view?usp=share_link",
    "test":
        "https://drive.google.com/file/d/19P-r7Y1opc4c4WrfPf4lrzE7_U2M3div/view?usp=share_link",
    "manual_test":
        "https://drive.google.com/file/d/1M4aP9_JG3V2kEoL1hJjYy0D--cAhG3-0/view?usp=share_link",
    "full_train":
        "https://drive.google.com/file/d/16WFbOrZYcdgj_nBVmbm6YO1umfTot6qh/view?usp=share_link",
}

# Column names applied to every CSV (the files are read with header=None).
cols = [
    'polarity',
    'id',
    'date',
    'query',
    'user',
    'text',
]

def load_and_merge_data(file_map, cols):
    """Download every dataset in ``file_map`` and merge them into one frame.

    Each file is read as a header-less, latin-1 encoded CSV with all columns
    kept as strings, and tagged with its label in a ``source`` column. After
    concatenation only rows whose ``polarity`` is '0', '2' or '4' and whose
    ``text`` is present are kept; a ``sentiment`` column holds the matching
    'negative' / 'neutral' / 'positive' label.

    Args:
        file_map: Mapping of dataset label -> Google Drive share URL.
        cols: Column names to assign to the CSV columns.

    Returns:
        A DataFrame with ``cols`` plus ``source`` and ``sentiment``. Rows keep
        their index from the concatenated frame (the index is not reset after
        filtering).

    Raises:
        ValueError: If ``file_map`` is empty.
    """
    # Fail early with a clear message; pd.concat([]) would otherwise raise
    # an opaque "No objects to concatenate".
    if not file_map:
        raise ValueError("file_map is empty: no datasets to load")

    frames = []
    for label, url in file_map.items():
        frame = pd.read_csv(
            gdrive_to_direct_link(url),
            encoding='latin-1',
            header=None,
            names=cols,
            dtype=str,
            low_memory=False,
        )
        frame['source'] = label
        frames.append(frame)
    df_all = pd.concat(frames, ignore_index=True)

    # Single source of truth for both the row filter and the label mapping.
    polarity_to_sentiment = {'0': 'negative', '2': 'neutral', '4': 'positive'}

    keep = (
        df_all['polarity'].isin(list(polarity_to_sentiment))
        & df_all['text'].notna()
    )
    # Explicit copy: adding a column to a filtered slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    df_all = df_all.loc[keep].copy()
    df_all['sentiment'] = df_all['polarity'].map(polarity_to_sentiment)

    return df_all

# Download all splits listed in file_map and merge them into one cleaned frame
df_all = load_and_merge_data(file_map, cols)


# Colab-only step: mount Google Drive at /content/drive so the CSV can be
# written there. Re-running the cell when Drive is already mounted only
# prints a notice.
from google.colab import drive
drive.mount('/content/drive')

# Destination of the combined CSV inside the mounted Drive
save_path = '/content/drive/MyDrive/raw_combined_data.csv'

# Save the combined dataset to Drive (index=False: the row index is not written)
df_all.to_csv(save_path, index=False)

print(f"Saved combined dataset to: {save_path}")

# Report the final shape; the trailing bare expression makes the notebook
# render the first rows as the cell output
print(f"Final dataset shape: {df_all.shape}")
df_all.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved combined dataset to: /content/drive/MyDrive/raw_combined_data.csv
Final dataset shape: (516, 8)


Unnamed: 0,polarity,id,date,query,user,text,source,sentiment
32298,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...,manual_test,positive
32299,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...,manual_test,positive
32300,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck...",manual_test,positive
32301,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...,manual_test,positive
32302,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...,manual_test,positive
