In [1]:
import pandas as pd
import time

### Scale Numerical Features

In [2]:
all_features = ["engaged_with_user_follower_count", "engaged_with_user_following_count", \
                "enaging_user_follower_count", "enaging_user_following_count"]
train = pd.read_csv("s3://recsys-challenge-2020/training.tsv", encoding="utf-8",
                    names = all_features, usecols= [10, 11, 15, 16], sep="\x01")

test = pd.read_csv("s3://recsys-challenge-2020/val_26_04_2020.tsv", encoding="utf-8",
                    names = all_features, usecols= [10, 11, 15, 16], sep="\x01")

train_test_combined = pd.concat([train,test],keys=[0,1])

from sklearn import preprocessing

x = train_test_combined.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)
df.index = train_test_combined.index
train_test_combined_scaled = pd.concat([train_test_combined, df], axis = 1)
train_scaled, test_scaled = train_test_combined_scaled.xs(0),train_test_combined_scaled.xs(1)
train_scaled_only = train_scaled[[0, 1, 2, 3]]
test_scaled_only = test_scaled[[0, 1, 2, 3]]
train_scaled_only.columns = ['a', 'b', 'c', 'd']
test_scaled_only.columns = ['a', 'b', 'c', 'd']

### Read Train

In [5]:
all_features = ["tweet_type", "Language", "enaged_with_user_id",\
                "engaged_with_user_is_verified", "engaging_user_id", "enaging_user_is_verified",\
                "engagee_follows_engager", "reply_timestamp"]

train = pd.read_csv("s3://recsys-challenge-2020/training.tsv", encoding="utf-8",
                    names = all_features, usecols= [6, 7, 9, 12, 14, 17, 19, 20], sep="\x01")
train['reply_bool'] = train.reply_timestamp.fillna(0)
train.loc[train.reply_bool != 0.0, 'reply_bool'] = 1.0
train = train[["tweet_type", "Language", "enaged_with_user_id",\
                "engaged_with_user_is_verified", "engaging_user_id", "enaging_user_is_verified",
               "engagee_follows_engager", "reply_bool"]]

### Read Test

In [7]:
all_features = ["tweet_type", "Language", "enaged_with_user_id",\
                "engaged_with_user_is_verified", "engaging_user_id", "enaging_user_is_verified",\
                "engagee_follows_engager", "reply_timestamp"]

test = pd.read_csv("s3://recsys-challenge-2020/val_26_04_2020.tsv", encoding="utf-8",
                    names = all_features, usecols= [6, 7, 9, 12, 14, 17, 19, 20], sep="\x01")
test['reply_bool'] = test.reply_timestamp.fillna(0)
test.loc[test.reply_bool != 0.0, 'reply_bool'] = 1.0
test = test[["tweet_type", "Language", "enaged_with_user_id",
                "engaged_with_user_is_verified", "engaging_user_id", "enaging_user_is_verified", 
             "engagee_follows_engager", "reply_bool"]]

In [58]:
train_df = pd.concat([train_scaled_only, train], axis = 1)
test_df = pd.concat([test_scaled_only, test], axis = 1)

In [10]:
def bool_to_str(value):
    "value should be a bool"
    return 'Yes' if value else 'No'

In [59]:
def construct_line(row, columns, dtypes, string_dict, file_obj, index, time1):
    
    if index % 1000000 == 0:
        print(str(time.time() - time1) +": "+ str(index))
        
    global categorical_index
    numerical_index = 0
    new_line = str(row[-1])+ " "
    for i in range(len(columns) - 1):
        value_at_i = row[i]
        dtype_at_i = dtypes[i]
        column_name_at_i = columns[i]
        
        if ((dtype_at_i == object) or (dtype_at_i == bool)): # Categorical Features
            if (dtype_at_i == bool):
                value_at_i =  bool_to_str(value_at_i)
            
            value_to_be_found = column_name_at_i +"_"+ value_at_i
            
            if value_to_be_found in string_dict:
                indexed_value = string_dict[value_to_be_found]
                new_line = new_line + str(indexed_value)+":1 "
            
            else:
                indexed_value = categorical_index
                categorical_index = categorical_index + 1
                new_line = new_line + str(indexed_value)+":1 "
                string_dict[value_to_be_found] = indexed_value
                
        else: # Numerical Features
            new_line = new_line + str(numerical_index) + ":" +str(value_at_i)+" "
            numerical_index = numerical_index + 1
    file_obj.write(new_line+"\n")

### Write Train and Test Simultaneously

In [60]:
cnames_numerical = list(train_df.select_dtypes(exclude=['object', 'bool']).columns)
categorical_index = len(cnames_numerical) - 1 # Categorical indices start from here.
string_dict = {}
f_train_df = open("f1/like/train_df.csv", 'a')
print(len(train_df))
time1 = time.time()
train_df.apply(lambda x : construct_line(x, train_df.columns, train_df.dtypes, string_dict, f_train_df, x.name,\
                                         time1), axis = 1)
f_train_df.close()
print(len(test_df))
f_test_df = open("f1/like/test_df.csv", 'a')
test_df.apply(lambda x : construct_line(x, test_df.columns, test_df.dtypes, string_dict, f_test_df, x.name,\
                                       time1), axis = 1)
f_test_df.close()

148075238
98.24931406974792: 0
475.55212593078613: 1000000
857.9525101184845: 2000000
1236.6246552467346: 3000000
1617.9577896595001: 4000000
1999.901519536972: 5000000
2382.9897944927216: 6000000
2764.2796103954315: 7000000
3146.476989507675: 8000000
3528.9119856357574: 9000000
3911.4429399967194: 10000000
4293.895839214325: 11000000
4678.222537517548: 12000000
5060.8272976875305: 13000000
5443.9498200416565: 14000000
5827.515817403793: 15000000
6210.165724515915: 16000000
6593.144823074341: 17000000
6976.708079099655: 18000000
7359.252466917038: 19000000
7740.870858669281: 20000000
8123.5502252578735: 21000000
8505.957402467728: 22000000
8888.170521497726: 23000000
9271.358973264694: 24000000
9653.465132713318: 25000000
10035.567963838577: 26000000
10418.494748353958: 27000000
10800.842840194702: 28000000
11184.410460472107: 29000000
11568.292068719864: 30000000
11950.819841623306: 31000000
12333.459243297577: 32000000
12713.593706130981: 33000000
13096.919048786163: 34000000
13479.1

In [65]:
mask = (test_df.reply_bool == 0.0)
len(test_df[mask == True])

12984679