In [1]:
import pandas as pd

def load_farm_ads_data(text_file: str, vector_file: str) -> tuple:
    """Load labelled ad texts and their sparse feature vectors.

    Both files are read as UTF-8, one sample per line; blank lines are
    skipped.

    * ``text_file`` lines look like ``<label> <token> <token> ...``.
    * ``vector_file`` lines look like ``<label> <idx>:<val> <idx>:<val> ...``.

    Args:
        text_file: Path to the whitespace-separated text file.
        vector_file: Path to the sparse ``index:value`` vector file.

    Returns:
        A ``(texts, labels, vector_df)`` tuple where ``texts`` and ``labels``
        are the ``.values`` arrays of the text and label columns, and
        ``vector_df`` is a dense DataFrame with one row per vector line, one
        column per feature index seen (in order of first appearance) and 0
        for features absent from a row.

    Raises:
        ValueError: If a label is not an integer or a feature token is not
            of the form ``<int>:<float>``. The message names the file and
            the 1-based line number.
        OSError: If either file cannot be opened.
    """
    # One {'label', 'text'} record per non-blank line of the text file.
    text_data = []
    with open(text_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:  # skip empty lines
                continue
            try:
                label = int(parts[0])
            except ValueError as err:
                raise ValueError(
                    f"{text_file}, line {line_no}: "
                    f"invalid label {parts[0]!r}"
                ) from err
            text_data.append({'label': label, 'text': ' '.join(parts[1:])})

    # Naming the columns explicitly keeps the lookups below valid even when
    # the file contained no samples (an empty frame would otherwise have no
    # 'label' / 'text' columns and raise KeyError).
    text_df = pd.DataFrame(text_data, columns=['label', 'text'])
    labels = text_df['label'].values
    texts = text_df['text'].values

    # One {feature_index: value} dict per non-blank line of the vector file.
    vector_data = []
    with open(vector_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:  # skip empty lines
                continue
            try:
                # The leading label is not returned from this file, but it is
                # still validated so a corrupt line is reported early.
                int(parts[0])
                features = {}
                for item in parts[1:]:
                    # '3:1' -> idx = '3', val = '1'
                    idx, val = item.split(':')
                    features[int(idx)] = float(val)
            except ValueError as err:
                raise ValueError(
                    f"{vector_file}, line {line_no}: "
                    f"malformed vector entry ({err})"
                ) from err
            vector_data.append(features)

    # Missing features become NaN when the dicts are aligned; store them as 0.
    vector_df = pd.DataFrame(vector_data).fillna(0)

    return texts, labels, vector_df

if __name__ == '__main__':
    # Script entry point: load the farm-ads dataset from the working
    # directory and print a short preview of the texts and feature vectors.
    # Names are left at module level rather than wrapped in a function.

    # File paths
    text_file = "farm-ads"
    vector_file = "farm-ads-vect"

    # Load data
    texts, labels, vector_data = load_farm_ads_data(text_file, vector_file)

    # Display results
    print("\nFirst 5 text samples with labels:")
    # Slicing (instead of indexing 0..4) keeps the preview from raising
    # IndexError when fewer than five samples were loaded.
    for label, text in zip(labels[:5], texts[:5]):
        print(f"Label: {label}\tText: {text[:50]}...")

    print("\nVector data shape:", vector_data.shape)
    print("First 5 rows of vector data (non-zero features only):")
    # Row-wise: keep only the non-zero entries of each row and show them as
    # a {feature_index: value} dictionary.
    print(vector_data.iloc[:5].apply(lambda x: x[x != 0].to_dict(), axis=1))


First 5 text samples with labels:
Label: 1	Text: ad-jerry ad-bruckheimer ad-chase ad-premier ad-sep...
Label: -1	Text: ad-rheumatoid ad-arthritis ad-expert ad-tip ad-inf...
Label: -1	Text: ad-rheumatologist ad-anju ad-varghese ad-yonker ad...
Label: -1	Text: ad-siemen ad-water ad-remediation ad-water ad-scar...
Label: -1	Text: ad-symptom ad-muscle ad-weakness ad-genetic ad-dis...

Vector data shape: (4143, 54877)
First 5 rows of vector data (non-zero features only):
0    {1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1....
1    {10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, ...
2    {29: 1.0, 31: 1.0, 35: 1.0, 101: 1.0, 131: 1.0...
3    {34: 1.0, 35: 1.0, 36: 1.0, 44: 1.0, 54: 1.0, ...
4    {8: 1.0, 9: 1.0, 429: 1.0, 430: 1.0, 431: 1.0,...
dtype: object
