Data Cleaning and Processing

In [None]:
#CNE-6 model factor data processing
def factor_data_process(melt_table, factor):
    """Winsorize, fill, neutralize and standardize one factor column.

    Steps, in order:

    1. Winsorize ``factor`` inside each ``pubDate`` group at the
       2.5% / 97.5% quantiles (infinities are turned into NaN).
    2. For every row whose factor is still missing, fill it with the mean of
       the factor over the stocks of the same industry on that date. The
       ``sw_l1`` classification is used when present, otherwise the first
       classification returned by ``get_industry``. Rows that still contain
       a NaN in any column afterwards are dropped.
    3. Neutralize the factor against market cap, one ``pubDate`` at a time,
       and write the result back into the table.
    4. Standardize the factor column.

    Args:
        melt_table: long-format DataFrame with at least the columns
            ``code``, ``pubDate`` and ``factor``. Steps 1 and 2 write into
            this frame in place.
        factor: name of the factor column to process.

    Returns:
        The processed DataFrame (the object produced by ``dropna``).
    """
    # De-extremize (winsorize) cross-sectionally, i.e. per publication date.
    melt_table[factor] = melt_table.groupby('pubDate')[factor].transform(
        winsorize, qrange=[0.025, 0.975], inclusive=True, inf2nan=True, axis=0)
    print('winsorize finished')

    # Fill missing values with the industry mean of the same date.
    missing_rows = melt_table[melt_table[factor].isnull()]
    # (industry_code, date) -> industry mean; avoids re-querying the same
    # industry/date pair for every missing stock.
    industry_mean_cache = {}
    for row_index, code, date in zip(missing_rows.index.tolist(),
                                     missing_rows['code'].tolist(),
                                     missing_rows['pubDate'].tolist()):
        industry_info = get_industry(code, date=date)[code]
        if not industry_info:
            # No industry classification: leave NaN, the row is dropped below.
            continue
        if 'sw_l1' in industry_info:
            industry = industry_info['sw_l1']['industry_code']
        else:
            first_key = next(iter(industry_info))
            industry = industry_info[first_key]['industry_code']

        cache_key = (industry, date)
        if cache_key not in industry_mean_cache:
            industry_stocks = get_industry_stocks(industry, date=date)
            industry_dic = lib.get_data(industry_stocks, [date], [factor],
                                        query_limit=False)
            # Melt every stock column of the industry frame. The original
            # passed an undefined name (`securities`) as value_vars here.
            melt_df = pd.melt(industry_dic[factor].reset_index(),
                              id_vars=['index'], var_name='code',
                              value_name=factor)
            industry_mean_cache[cache_key] = np.mean(melt_df[factor])
        melt_table.at[row_index, factor] = industry_mean_cache[cache_key]

    # Rows that could not be filled (or have NaN in any other column) go away.
    melt_table = melt_table.dropna()
    print('nan value fill finished')

    # Neutralize per date. Results are collected positionally and assigned
    # once after the loop; the original only modified a per-group copy, so
    # the neutralized values never reached melt_table.
    neutralized_values = melt_table[factor].to_numpy(dtype=float, copy=True)
    for pub_date, positions in melt_table.groupby('pubDate').indices.items():
        group = melt_table.iloc[positions]
        exposures = group.set_index('code')[factor]
        neutralized = neutralize(exposures, how=['market_cap'],
                                 date=pub_date, axis=0)
        # NOTE(review): assumes neutralize returns a Series indexed by stock
        # code; reindex keeps row alignment if it reorders or drops stocks.
        neutralized_values[positions] = neutralized.reindex(
            exposures.index).to_numpy()
    melt_table[factor] = neutralized_values
    print('neutralize finished')

    # Standardize.
    # NOTE(review): this runs over the whole column, not per pubDate like the
    # steps above - confirm that pooling all dates is intended.
    melt_table[factor] = standardlize(melt_table[factor], inf2nan=True, axis=0)
    print('standardlize finished')
    return melt_table
    
#daily returns (close-to-close) per security
def rates_attain(securities, start_date, end_date):
    """Return a long-format table of day-over-day close-price returns.

    Close prices are fetched with ``get_price``, turned into simple returns
    (``close / previous close - 1``) and melted into one row per
    (date, security).

    Args:
        securities: security codes; also used as the columns to melt.
        start_date: first date passed to ``get_price``.
        end_date: last date passed to ``get_price``.

    Returns:
        DataFrame with columns ``pubDate`` (datetime), ``code`` and
        ``rates``, without rows whose return is exactly 0.0 or NaN.
    """
    price_data = get_price(securities, start_date=start_date,
                           end_date=end_date, fields=('close',))
    close = price_data['close']
    previous_close = close.shift(1)
    returns = close / previous_close - 1

    long_returns = pd.melt(returns.reset_index(), id_vars=['index'],
                           value_vars=securities, var_name='code',
                           value_name='rates')
    long_returns = long_returns.rename(columns={'index': 'pubDate'})
    long_returns['pubDate'] = pd.to_datetime(long_returns['pubDate'])

    # Exact-zero returns are discarded; the original author treats them as
    # a marker of unlisted/delisted securities.
    is_nonzero = long_returns['rates'] != 0.0
    traded = long_returns[is_nonzero]
    return traded.dropna()

Feature Engineering

In [None]:
import re

def data_cleaning(new_train):
    """Add hand-crafted text features and build the model feature matrix.

    The ``subject`` and ``email`` columns are cast to ``str`` and a set of
    count/ratio columns is added to ``new_train`` in place. The returned
    matrix is the output of ``words_in_texts`` for a fixed word list,
    followed by twelve of the engineered columns in this order:
    ``mark_subject, cap_body, cap_subject, turn_body, reply, dollar_sign,
    hash, star, #, -, ?, []``. The columns ``turn_subject``, ``:_subject``,
    ``/`` and ``:`` are added to the frame but are not part of the matrix.

    Args:
        new_train: DataFrame with ``subject`` and ``email`` columns.
            It is modified in place.

    Returns:
        Tuple ``(new_train, X_train)`` where ``X_train`` is a 2-D NumPy
        array with one row per row of ``new_train``.
    """
    new_train['subject'] = new_train['subject'].astype(str)
    new_train['email'] = new_train['email'].astype(str)

    def kept_length(column, pattern):
        # Length of each text after deleting everything matched by `pattern`
        # (patterns are negated classes, so this counts the kept characters).
        compiled = re.compile(pattern)
        return new_train[column].map(lambda text: len(compiled.sub("", text)))

    def match_count(column, pattern):
        # Number of non-overlapping matches of `pattern` in each text.
        compiled = re.compile(pattern)
        return new_train[column].map(lambda text: len(compiled.findall(text)))

    def capital_ratio(text):
        # Share of A-Z characters; an empty text has ratio 0 instead of
        # raising ZeroDivisionError.
        if not text:
            return 0.0
        return len(re.sub(r'[^A-Z]+', "", text)) / len(text)

    # subject
    new_train['cap_subject'] = kept_length('subject', r'[^A-Z]+')
    new_train['mark_subject'] = kept_length('subject', r'[^!]+')
    new_train['reply'] = match_count('subject', r'Re:')
    new_train['Free'] = match_count('subject', r'FREE')
    new_train['star'] = kept_length('subject', r'[^*]+')
    new_train['turn_subject'] = kept_length('subject', r'[^\n]+')
    new_train[':_subject'] = kept_length('subject', r'[^:]+')
    new_train['[]'] = kept_length('subject', r'[^\[\]]+')
    # body
    new_train['cap_body'] = new_train['email'].map(capital_ratio)
    new_train['turn_body'] = kept_length('email', r'[^\n]+')
    # '$' must be escaped: a bare r'$' is the end-of-string anchor and
    # matches positions, not dollar characters.
    new_train['dollar_sign'] = match_count('email', r'\$')
    # Despite its name this column counts backslashes; '#' below counts
    # hash characters.
    new_train['hash'] = match_count('email', r"\\")
    new_train['#'] = match_count('email', r'#')
    new_train['-'] = match_count('email', r"-")
    new_train['?'] = match_count('email', r'\?')
    new_train['/'] = match_count('email', r'\/')
    new_train[':'] = match_count('email', r'\:')

    some_words = ['drug', 'bank', 'prescription', 'memo', 'private', '<html>','dear','100']
    word_features = np.asarray(words_in_texts(some_words, new_train['email']))

    # Append the engineered columns positionally in one step; label-based
    # `series[i]` lookups fail when the frame's index is not 0..n-1.
    extra_columns = ['mark_subject', 'cap_body', 'cap_subject', 'turn_body',
                     'reply', 'dollar_sign', 'hash', 'star', '#', '-', '?',
                     '[]']
    X_train = np.hstack([word_features, new_train[extra_columns].to_numpy()])
    return new_train, X_train

from sklearn.linear_model import LogisticRegression

# Build the engineered feature matrix from the raw training frame
# (`train` and `Y_train` are defined outside this cell).
new_train, X_train = data_cleaning(train)

# Fit a logistic-regression classifier with the library's default settings.
model = LogisticRegression()
model.fit(X_train, Y_train)

# Score on the same data the model was fitted on, so this is a training
# accuracy, not a held-out estimate.
training_accuracy = model.score(X_train, Y_train)
print("Training Accuracy: ", training_accuracy)

SVM Machine Learning

In [None]:
def train_and_evaluate_model(train_data, train_label, val_data, val_label, train_sizes):
    """Fit a linear-kernel SVC on growing prefixes of the training data.

    Both data arrays are flattened to one feature vector per sample. For each
    entry of ``train_sizes`` a fresh model is fitted on the first ``size``
    training samples and scored with ``evaluation_metric`` on that subset and
    on the full validation set; one progress line is printed per size.

    Args:
        train_data: training samples, first axis is the sample axis.
        train_label: labels aligned with ``train_data``.
        val_data: validation samples, first axis is the sample axis.
        val_label: labels aligned with ``val_data``.
        train_sizes: iterable of prefix lengths to train on.

    Returns:
        Tuple ``(train_accuracies, val_accuracies)``, two lists with one
        entry per element of ``train_sizes``.
    """
    # Collapse every sample to a single row of features.
    flat_train = train_data.reshape((train_data.shape[0], -1))
    flat_val = val_data.reshape((val_data.shape[0], -1))

    train_accuracies, val_accuracies = [], []
    for size in train_sizes:
        subset_x = flat_train[:size]
        subset_y = train_label[:size]

        classifier = SVC(kernel='linear')
        classifier.fit(subset_x, subset_y)

        # Predictions on the data just trained on, then on the validation set.
        predicted_train = classifier.predict(subset_x)
        predicted_val = classifier.predict(flat_val)

        train_accuracy = evaluation_metric(subset_y, predicted_train)
        val_accuracy = evaluation_metric(val_label, predicted_val)

        print(f"Trained with {size} samples, Training accuracy: {train_accuracy}, Validation accuracy: {val_accuracy}")

        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

    return train_accuracies, val_accuracies

def plot_accuracies(train_sizes, train_accuracies, val_accuracies, title):
    """Draw training and validation accuracy against training-set size.

    Args:
        train_sizes: x values, the number of training examples per run.
        train_accuracies: training accuracy for each entry of ``train_sizes``.
        val_accuracies: validation accuracy for each entry of ``train_sizes``.
        title: title placed on the figure.
    """
    plt.figure(figsize=(10, 5))

    # One line per curve, drawn in the same order as the legend entries.
    curves = (
        (train_accuracies, 'Training Accuracy'),
        (val_accuracies, 'Validation Accuracy'),
    )
    for accuracies, curve_label in curves:
        plt.plot(train_sizes, accuracies, label=curve_label)

    plt.xlabel('Number of Training Examples')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()