In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [7]:
# Load the train and test splits from the local data directory.
train_data = pd.read_csv("data/Train.csv")
test_data = pd.read_csv("data/Test.csv")

# Partition columns by dtype. The numeric list is computed separately for the
# test frame because the two frames do not have to share every numeric column
# (the printed lists below show which ones differ).
numeric_columns = train_data.select_dtypes(include=[np.number]).columns.tolist()
test_data_numeric_columns = test_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = train_data.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric columns:", numeric_columns)
print("Test data numeric columns:", test_data_numeric_columns)
print("Categorical columns:", categorical_columns)

numeric_imputer = SimpleImputer()  # default strategy: column mean
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Fit the imputers on the TRAIN split only. Re-fitting on the test split would
# fill test rows with statistics computed from the test set itself, so train
# and test would be preprocessed inconsistently.
train_data[numeric_columns] = numeric_imputer.fit_transform(train_data[numeric_columns])

# The numeric column lists differ between the frames, so the fitted imputer
# cannot be applied to the test frame directly. Reuse its learned per-column
# fill values instead: fillna() matches them by column name and ignores
# entries for columns the test frame does not have. A numeric test column
# that is absent from the train list is left unfilled.
train_numeric_fill = pd.Series(numeric_imputer.statistics_, index=numeric_columns)
test_data[test_data_numeric_columns] = (
    test_data[test_data_numeric_columns]
    .fillna(train_numeric_fill)
    .astype("float64")  # same dtype the imputer output had
)

# The categorical column list is shared by both frames, so the train-fitted
# imputer can transform the test frame as-is.
train_data[categorical_columns] = categorical_imputer.fit_transform(train_data[categorical_columns])
test_data[categorical_columns] = categorical_imputer.transform(test_data[categorical_columns])

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Numeric columns: ['total_female', 'total_male', 'night_mainland', 'night_zanzibar', 'total_cost']
Test data numeric columns: ['total_female', 'total_male', 'night_mainland', 'night_zanzibar']
Categorical columns: ['ID', 'country', 'age_group', 'travel_with', 'purpose', 'main_activity', 'info_source', 'tour_arrangement', 'package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz', 'package_sightseeing', 'package_guided_tour', 'package_insurance', 'payment_mode', 'first_trip_tz', 'most_impressing']
Train data shape: (4809, 23)
Test data shape: (1601, 22)


In [8]:
# Preview the first five rows of the imputed training frame.
train_data.head(n=5)

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost
0,tour_0,SWIZERLAND,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Independent,...,No,No,No,No,13.0,0.0,Cash,No,Friendly People,674602.5
1,tour_10,UNITED KINGDOM,25-44,Alone,1.0,0.0,Leisure and Holidays,Cultural tourism,others,Independent,...,No,No,No,No,14.0,7.0,Cash,Yes,"Wonderful Country, Landscape, Nature",3214906.5
2,tour_1000,UNITED KINGDOM,25-44,Alone,0.0,1.0,Visiting Friends and Relatives,Cultural tourism,"Friends, relatives",Independent,...,No,No,No,No,1.0,31.0,Cash,No,Excellent Experience,3315000.0
3,tour_1002,UNITED KINGDOM,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,...,Yes,Yes,Yes,No,11.0,0.0,Cash,Yes,Friendly People,7790250.0
4,tour_1004,CHINA,1-24,Alone,1.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Independent,...,No,No,No,No,7.0,4.0,Cash,Yes,No comments,1657500.0


```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# End-to-end baseline: load the training CSV, clean and encode the features,
# fit a random forest on `total_cost`, then report RMSE and feature importances.

# 1. Load and inspect data
df = pd.read_csv('competitions/tanzania-tourism-prediction/data/Train.csv')

# 2. Data preprocessing
categorical_columns = ['country', 'age_group', 'travel_with', 'purpose', 
                      'main_activity', 'info_source', 'tour_arrangement',
                      'payment_mode', 'most_impressing']

# Handle missing values.
# Categorical gaps get a string placeholder first: a blanket fillna(0) would
# mix the integer 0 into string columns, and LabelEncoder cannot encode a
# column holding both strings and numbers.
df[categorical_columns] = df[categorical_columns].fillna('Unknown')
# Everything that is still missing is filled with 0.
df = df.fillna(0)  # Or use more sophisticated imputation

# 3. Feature engineering
# Convert categorical variables to numeric using label encoding.
# One encoder per column is kept in `label_encoders` so each column's mapping
# can be reused or inverted later. `le` still ends up bound to the encoder
# fitted on the last column.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Create binary features from package columns.
# DataFrame.map expects a callable, not a dict, so compare against 'Yes'
# instead: 'Yes' -> 1, anything else ('No' or the 0 used for missing) -> 0.
package_columns = [col for col in df.columns if col.startswith('package_')]
df[package_columns] = df[package_columns].eq('Yes').astype(int)

# 4. Feature selection
numerical_cols = ['total_female', 'total_male', 'night_mainland', 'night_zanzibar']
features = categorical_columns + package_columns + numerical_cols

X = df[features]
y = df['total_cost']

# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies so the column assignments below write to independent
# frames and do not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# 6. Scale numerical features
# The scaler is fitted on the training split only and then applied to the
# held-out split.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# 7. Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 8. Make predictions and evaluate
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

print(f'Train RMSE: {train_rmse:.2f}')
print(f'Test RMSE: {test_rmse:.2f}')

# 9. Feature importance
# Importances are reported in the same order as `features`, then sorted.
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))
```