In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score
import joblib

In [27]:

class EmailTimingPredictor:
    def __init__(self):
        self.customer_features = None
        self.email_history = None
        self.model = None
        self.customer_codes = None
        
    def load_data(self, customer_csv_path, history_csv_path):
        self.customer_features = pd.read_csv(customer_csv_path, low_memory=False)
        self.email_history = pd.read_csv(history_csv_path, low_memory=False)
        
    def prepare_training_data(self):
        email_agg = self.email_history.groupby('CUSTOMER_CODE').agg({
            'send_timestamp': lambda x: x.tolist(),
            'open_timestamp': lambda x: x.tolist()
        }).reset_index()
        target_columns = [f'timeslot_{i}' for i in range(28)]
        def create_target_vector(opened_slots):
            target = [0] * 28
            if opened_slots and len(opened_slots) > 0:
                for slot in opened_slots:
                    if 0 <= slot < 28:
                        target[slot] = 1
            return target
        email_agg['target'] = email_agg['open_timestamp'].apply(create_target_vector)
        merged_data = self.customer_features.copy()
        
        merged_data = merged_data.merge(
            email_agg[['CUSTOMER_CODE', 'target']], 
            on='CUSTOMER_CODE', 
            how='left'
        )
        # target_mode = email_agg['target'].mode().iloc[0] if not email_agg['target'].mode().empty else [0] * 28
        merged_data['target'] = merged_data['target'].fillna(str([0] * 28))
        merged_data['target'] = merged_data['target'].apply(
            lambda x: eval(x) if isinstance(x, str) else x
        )
        
        self.customer_codes = merged_data['CUSTOMER_CODE']
        X = merged_data.drop(['CUSTOMER_CODE', 'target'], axis=1)
        y = pd.DataFrame(merged_data['target'].tolist(), columns=target_columns)
        
        return X, y
    
    def train_model(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        base_classifier = RandomForestClassifier(
            n_estimators=300, 
            max_depth=20,
            min_samples_split=5,
            n_jobs=-1,
            random_state=42
        )
        self.model = MultiOutputClassifier(base_classifier)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        precisions = [precision_score(y_test.iloc[:, i], y_pred[:, i]) 
                      for i in range(28)]
        
        for i, precision in enumerate(precisions):
            print(f"Timeslot {i} Precision: {precision:.4f}")
        
    def predict_best_timeslots(self, X):
        if self.model is None:
            raise ValueError("Model not trained. Call train_model first.")
        predictions = self.model.predict_proba(X)
        
        rankings = []
        for row_pred in zip(*predictions):
            time_slot_probs = [pred[0] for pred in row_pred]
            sorted_timeslots = sorted(
                range(28), 
                key=lambda i: time_slot_probs[i], 
                reverse=True
            )
            
            ranked_slots = [f'slot_{slot+1}' for slot in sorted_timeslots]
            rankings.append(ranked_slots)
        
        return rankings
    
    def create_submission(self, X, test_customers_path='Test/test_customers.csv'):
        rankings = self.predict_best_timeslots(X)
        submission_df = pd.DataFrame({
            'customer_code': self.customer_codes,
            'predicted_slots_order': rankings
        })
        customer_id_df = pd.read_csv(test_customers_path)
        customer_ids = customer_id_df['CUSTOMER_CODE'].tolist()
        missing_ids = set(customer_ids) - set(submission_df['customer_code'])
        if missing_ids:
            print(f"Warning: The following customer IDs are missing from the submission data: {missing_ids}")
        filtered_submission_df = submission_df[submission_df['customer_code'].isin(customer_ids)]
        filtered_submission_df = filtered_submission_df.set_index('customer_code').reindex(customer_ids).reset_index()
        filtered_submission_df.to_csv('submission.csv', index=False)
        return filtered_submission_df

    
    def save_model(self, filepath='email_timing_model.joblib'):
        save_data = {
            'model': self.model,
            'scaler': self.scaler,
            'customer_codes': self.customer_codes
        }
        joblib.dump(save_data, filepath)
        print(f"Model saved to {filepath}")
    
    def load_model(self, filepath='email_timing_model.joblib'):
        saved_data = joblib.load(filepath)
        self.model = saved_data['model']
        self.scaler = saved_data['scaler']
        self.customer_codes = saved_data['customer_codes']
        print(f"Model loaded from {filepath}")
    
    def fine_tune(self, new_X, new_y):
        for i, estimator in enumerate(self.model.estimators_):
            estimator.fit(new_X, new_y.iloc[:, i])

In [28]:
predictor = EmailTimingPredictor()

In [29]:
predictor.load_data('CDNA.csv', 'HISTORY.csv')
X, y = predictor.prepare_training_data()

In [30]:
print(X.shape)

(220699, 110)


In [31]:
predictor.train_model(X, y)
predictor.save_model('doraemon.joblib')

Timeslot 0 Precision: 0.9820
Timeslot 1 Precision: 0.5166
Timeslot 2 Precision: 0.5560
Timeslot 3 Precision: 0.5802
Timeslot 4 Precision: 0.5455
Timeslot 5 Precision: 0.5611
Timeslot 6 Precision: 0.5893
Timeslot 7 Precision: 0.5945
Timeslot 8 Precision: 0.4118
Timeslot 9 Precision: 0.4375
Timeslot 10 Precision: 0.6111
Timeslot 11 Precision: 0.5989
Timeslot 12 Precision: 0.3846
Timeslot 13 Precision: 0.5972
Timeslot 14 Precision: 0.5587
Timeslot 15 Precision: 0.5989
Timeslot 16 Precision: 0.4583
Timeslot 17 Precision: 0.5968
Timeslot 18 Precision: 0.5960
Timeslot 19 Precision: 0.5944
Timeslot 20 Precision: 0.4318
Timeslot 21 Precision: 0.6037
Timeslot 22 Precision: 0.5689
Timeslot 23 Precision: 0.6278
Timeslot 24 Precision: 0.5333
Timeslot 25 Precision: 0.5435
Timeslot 26 Precision: 0.5822
Timeslot 27 Precision: 0.5992
Model saved to doraemon.joblib


In [32]:
predictor.load_model('doraemon.joblib')

Model loaded from doraemon.joblib


In [33]:
predictor.load_data('TEST_CDNA.csv', 'TEST_HISTORY.csv')
test_X, test_y = predictor.prepare_training_data()

In [34]:
print(test_X.shape)

(68450, 110)


In [35]:
predictor.fine_tune(test_X, test_y)
predictor.save_model('doraemon_sinchan.joblib')

Model saved to doraemon_sinchan.joblib


In [36]:
rankings = predictor.predict_best_timeslots(test_X)

In [37]:
print(len(rankings))

68450


In [38]:
submission = predictor.create_submission(test_X)
print(f"Submission shape: {submission.shape}")
print(submission.head())

Submission shape: (65708, 2)
                                       customer_code  \
0  2cf9e9b31ca760b8772d8e136e85c640df36b491ba8be7...   
1  34481740a44eae239d61c7c0526dc42dc221a94939a8c1...   
2  bb5d102bb50fd308aee248d0fa5cdddbf1a29e263d8e2d...   
3  8229cb51e7392dc37cac4fd94959699bca2c9424a8548c...   
4  f812ad81bc6939bab1856476f12ee2ff8100b8308a3b1c...   

                               predicted_slots_order  
0  [slot_10, slot_27, slot_7, slot_14, slot_25, s...  
1  [slot_9, slot_25, slot_22, slot_5, slot_18, sl...  
2  [slot_18, slot_14, slot_10, slot_6, slot_21, s...  
3  [slot_6, slot_5, slot_13, slot_21, slot_10, sl...  
4  [slot_17, slot_26, slot_18, slot_12, slot_25, ...  
