In [4]:
from tokenize import ContStr
import pandas as pd

# Read the data from the CSV file
df = pd.read_csv('selected.csv')

# Lookup tables that translate the integer codes stored in the data into the
# English phrases inserted into the generated trip description by create_text().

# Education code -> phrase; it is placed after "with" in the sentence, so each
# label carries its own article.
education_map = {
    1: "an unfinished compulsory education",
    2: "a compulsory education diploma",
    3: "a vocational education diploma (Federal Certificate of Capacity – CFC)",
    4: "a generalist secondary education diploma (3-year program, ECG)",
    5: "a general baccalaureate diploma (Maturité)",
    6: "a Bachelor’s or Master’s degree from a university of applied sciences or professional school",
    7: "an academic Bachelor’s or Master’s degree from a university or institute of technology",
    8: "a doctoral degree (PhD)",
}

# Destination-activity code -> trip purpose.
purpose_map = {
    1: "work",
    2: "professional trip",
    3: "studying",
    4: "shopping",
    5: "activity at home",
    6: "eating/drinking",
    7: "personal business",
    8: "driving someone",
    9: "cultural activity or sport",
    10: "going out (with friends, restaurant, cinema, theater)",
    11: "other",
}

# Car-availability code -> frequency adverb. The label is used as
# "... and <label> has a car to go out", so it must be the adverb "sometimes"
# (occasionally), not "sometime" (at some unspecified moment).
caravail_map = {
    1: "always",
    2: "sometimes",
    3: "never",
}

# Household monthly income bracket code -> phrase.
income_map = {
    1: "less than 2500 CHF",
    2: "from 2501 to 4000 CHF",
    3: "from 4001 to 6000 CHF",
    4: "from 6001 to 8000 CHF",
    5: "from 8001 to 10000 CHF",
    6: "more than 10001 CHF",
}

# Gender code -> label (the name `male_map` is kept because create_text() refers to it).
male_map = {
    1: "male",
    2: "female",
}

# Function (plus private helpers) that builds a text description from one data row

# Text inserted when a coded value has no entry in its lookup table, so the
# literal string "None" never leaks into the generated sentence.
_UNKNOWN_LABEL = "unknown"


def _count_phrase(count, noun):
    """Return "<count> <noun>", adding a plural "s" unless count is exactly 1."""
    suffix = "" if count == 1 else "s"
    return f"{count} {noun}{suffix}"


def _label(code_map, raw_code):
    """Look up int(raw_code) in code_map, falling back to _UNKNOWN_LABEL for unmapped codes."""
    return code_map.get(int(raw_code), _UNKNOWN_LABEL)


def create_text(row):
    """Render one record as an English paragraph describing the trip and the traveler.

    Args:
        row: Mapping-like record (for example a DataFrame row) indexable by the
            keys "DestAct", "age", "Income", "Gender", "Education", "CarAvail",
            "distance_km", "NbCar", "NbMoto", "NbBicy", "TimeCar", "TimePT",
            "CostCarCHF", "CostPT" and "NbTransf".

    Returns:
        str: The description. Coded fields are translated through the
        module-level lookup tables; a code missing from its table is rendered
        as "unknown". Counts are pluralised for every value except exactly 1
        (so "0 motorcycles", "1 car", "3 bicycles").

    Raises:
        KeyError: If a required key is absent from ``row``.
        ValueError, TypeError: If a value cannot be converted by ``int()`` /
            ``float()`` (for example a NaN in an integer field).
    """
    # Coded categorical fields -> phrases.
    purpose = _label(purpose_map, row["DestAct"])
    income = _label(income_map, row["Income"])
    gender = _label(male_map, row["Gender"])
    education = _label(education_map, row["Education"])
    caravail = _label(caravail_map, row["CarAvail"])

    # Plain numeric fields. int() truncates any fractional part.
    age = int(row["age"])
    distance = float(row["distance_km"])
    timecar = int(row["TimeCar"])
    timept = int(row["TimePT"])
    costcar = float(row["CostCarCHF"])
    costpt = float(row["CostPT"])

    # Counts rendered together with a correctly pluralised noun.
    car = _count_phrase(int(row["NbCar"]), "car")
    motor = _count_phrase(int(row["NbMoto"]), "motorcycle")
    bicycle = _count_phrase(int(row["NbBicy"]), "bicycle")
    transfer = _count_phrase(int(row["NbTransf"]), "transfer")

    return (
        f"The trip distance is {distance} km, and trip purpose is {purpose}. There are three travel mode for traveler to choose: "
        f"1. Public transport: travel time is {timept} minutes, and the travel cost is {costpt} CHF with a total of {transfer}. "
        f"2. Driving: travel time is {timecar} minutes, and the travel cost is {costcar} CHF. "
        f"3. Soft modes is free of charge. "
        f"Traveler is {age} year old, {gender}, with {education} and the person's household monthly income is {income}. "
        f"The person's household owns {car}, {motor}, {bicycle} and {caravail} has a car to go out."
    )
# Sanity check: print the description generated for the first row.
print(create_text(df.iloc[0]))

# Create the new text column (one description per row) and an integer label column.
df['INFOR'] = df.apply(create_text, axis=1)
df['CHOICE'] = df['Choice'].astype(int)

# Save the result to a new CSV file. The path lives in a single variable so the
# confirmation message always names the file that was actually written
# (the message previously said 'text_data.csv' while the file is 'textdata.csv').
output_path = 'textdata.csv'
df[['ID', 'INFOR', 'CHOICE']].to_csv(output_path, index=False)

print(f"Dữ liệu đã được chuyển đổi và lưu vào '{output_path}'")

The trip distance is 30.0 km, and trip purpose is professional trip. There are three travel mode for traveler to choose: 1. Public transport: travel time is 85 minutes, and the travel cost is 12.4 CHF with a total of 4 transfers. 2. Driving: travel time is 32 minutes, and the travel cost is 4.54 CHF. 3. Soft modes is free of charge. Traveler is 27 year old, male, with an academic Bachelor’s or Master’s degree from a university or institute of technology and the person's household monthly income is from 6001 to 8000 CHF. The person's household owns 1 car, 0 motorcycle, 3 bicycles and sometime has a car to go out.
Dữ liệu đã được chuyển đổi và lưu vào 'text_data.csv'


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the text dataset; the file is expected to sit in the same directory as this script.
df = pd.read_csv("textdata.csv")

# Hold out 20% of the rows as the test part, stratifying on the CHOICE label;
# the fixed random_state makes the split repeatable.
temp_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['CHOICE'],
)

# Write both parts to disk without the index column.
for split_df, split_path in ((temp_df, "train.csv"), (test_df, "test.csv")):
    split_df.to_csv(split_path, index=False)

# Print the size of each part and its share of the full dataset.
n_total = len(df)
n_train = len(temp_df)
n_test = len(test_df)
print(f"Tổng số mẫu: {n_total}")
print(f"Train set: {n_train} mẫu ({n_train/n_total*100:.1f}%)")
print(f"Test set: {n_test} mẫu ({n_test/n_total*100:.1f}%)")

Tổng số mẫu: 1523
Train set: 1218 mẫu (80.0%)
Test set: 305 mẫu (20.0%)
