Description of the files

 - ratings.csv - ratings given by an employee of a company on the given date(higher rating means a better experience)
 - remarks.csv - remarks entered by the employee about the company on the given date(emp along with comp should make a unique id)
 - remarks_supp_opp.csv - Whether an employee was in favor of or opposed the remark of another employee
 - train.csv - the training set, with the prediction field 'left' specifying whether the employee left the company in the coming few months after the end of data collection(this is the field you need to make predictions for)
 - test.csv - the test set
 - baseline_0.csv - Sample submission for baseline_0
 
Description of the fields

 - emp - employee identification within a company( Ignore any negative numbers in this field )
 - comp - hashed company name
 - lastratingdate - last date on which the employee provided any rating on the platform
 - txt - the remark entered by the employee - converted into *(number of * specify the length of the remark)
 - Other fields are self explanatory

In [1]:
# Team_11
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
remarks = pd.read_csv('prml20jslot\\remarks.csv')
ratings = pd.read_csv('prml20jslot\\ratings.csv')
remarks_supp_opp = pd.read_csv('prml20jslot\\remarks_supp_opp.csv')
train = pd.read_csv('prml20jslot\\train.csv')
test = pd.read_csv('prml20jslot\\test.csv')
baseline = pd.read_csv('prml20jslot\\baseline_0.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
remarks_supp_opp.head()

Unnamed: 0,emp,comp,support,oppose,remarkId
0,307,bnivzbfi,True,False,fvwadfrj
1,36,bnivzbfi,True,False,fvwadfrj
2,276,bnivzbfi,True,False,fvwadfrj
3,24,bnivzbfi,True,False,fvwadfrj
4,382,bnivzbfi,True,False,xrrfkgap


In [5]:
train['left'].value_counts()

0    2932
1     594
Name: left, dtype: int64

In [6]:
# oversampling of the minority class
majority = train[train.left==0]
minority = train[train.left==1]

minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=2)
final_df = pd.concat([majority, minority_upsampled])

In [7]:
final_df['left'].value_counts()

1    2932
0    2932
Name: left, dtype: int64

In [8]:
# dictionary to map every company to a unique number
companies = train['comp'].unique()
dict_company = {}
num = 1
for company in companies:
    dict_company[company]=num
    num +=1

In [9]:
# creating dictionaries with average and latest rating for each unique emp+company combination
g = ratings.groupby(['emp','comp'])
avg_rating_dict = {}
latest_rating_dict = {}     # 4377 is the length

for name, group in g:
    unique_key = str(name[0])+'_'+str(name[1])
    avg_rating = group['rating'].mean()
    latest_rating = int(group['rating'].values[-1])
    avg_rating_dict[unique_key] = avg_rating
    latest_rating_dict[unique_key] = latest_rating

In [10]:
# dictionary with avg remark length
remarks = remarks.dropna()
remarks['emp'] = abs(remarks['emp'])
g2 = remarks.groupby(['emp','comp'])
avg_remark_length = {}
common_remark = {}

for name, group in g2:
    unique_key = str(name[0])+'_'+str(name[1])
    all_remarks = group['txt'].values
    lengths = [len(i) for i in all_remarks]
    avg_length = np.array(lengths).mean()
    avg_remark_length[unique_key] = avg_length

In [11]:
# dictionary with no of supporting/opposing remarks
remarks_supp_opp = remarks_supp_opp.dropna()
remarks_supp_opp['emp'] = abs(remarks_supp_opp['emp'])
g3 = remarks_supp_opp.groupby(['emp','comp'])
support_dict = {}
oppose_dict = {}

for name, group in g3:
    unique_key = str(name[0])+'_'+str(name[1])
    support_dict[unique_key] = len(group[group['support']==True])
    oppose_dict[unique_key] = len(group[group['oppose']==True])

In [21]:
supp_mean = np.array(list(support_dict.values())).mean()
opp_mean = np.array(list(oppose_dict.values())).mean()

def remark_length(x):
    try:
        return avg_remark_length[str(x['emp'])+'_'+str(x['comp'])]
    except:
        return 86.776 #the overall avg
    
def support(x):
    try:
        return support_dict[str(x['emp'])+'_'+str(x['comp'])]
    except:
        return supp_mean

def oppose(x):
    try:
        return oppose_dict[str(x['emp'])+'_'+str(x['comp'])]
    except:
        return opp_mean


def process_df(df):
    
    #df['avg_rating'] = df.apply(lambda x: avg_rating_dict[str(x['emp'])+'_'+str(x['comp'])], axis=1)
    df['Support'] = df.apply(lambda x: support(x), axis=1)
    df['Oppose'] = df.apply(lambda x: oppose(x), axis=1)
    df['Avg_remark_length'] = df.apply(lambda x: remark_length(x), axis=1)
    df['latest_rating'] = df.apply(lambda x: latest_rating_dict[str(x['emp'])+'_'+str(x['comp'])], axis=1)
    df['comp'] = df.apply(lambda x: dict_company[x['comp']], axis=1)
    df['Date'] = df.apply(lambda x: int(x['lastratingdate'][0:2]), axis=1)
    df['Month'] = df.apply(lambda x: int(x['lastratingdate'][3:5]), axis=1)
    df['Year'] = df.apply(lambda x: int(x['lastratingdate'][6:]), axis=1)
    
    return df

In [22]:
train = process_df(final_df)

In [24]:
train

Unnamed: 0,id,emp,comp,lastratingdate,left,Support,Oppose,Avg_remark_length,latest_rating,Date,Month,Year
1,4349,250,2,19-03-2017,0,233.000000,29.000000,136.326923,4,19,3,2017
2,945,134,3,21-09-2016,0,32.000000,0.000000,45.800000,3,21,9,2016
3,4553,164,4,17-03-2017,0,19.000000,1.000000,21.200000,2,17,3,2017
4,941,129,3,04-04-2016,0,7.000000,1.000000,75.000000,3,4,4,2016
5,2694,258,5,06-02-2017,0,63.000000,10.000000,237.250000,3,6,2,2017
6,235,277,6,03-01-2017,0,2.000000,0.000000,86.776000,4,3,1,2017
7,2289,18,7,20-03-2017,0,5.000000,0.000000,136.000000,2,20,3,2017
8,3662,21,8,14-03-2017,0,85.558965,15.812575,86.776000,3,14,3,2017
9,4270,108,2,07-12-2016,0,1.000000,0.000000,86.776000,3,7,12,2016
10,3022,28,9,25-10-2016,0,85.558965,15.812575,176.500000,4,25,10,2016


In [25]:
# picking X and y from the training data and shuffling randomly
X = np.array(train.drop(['lastratingdate','left'], axis=1))
y = np.array(train['left']).astype('int32')
indices = [i for i in range(len(X))]
np.random.shuffle(indices)
X = X[indices]
y = y[indices]

In [26]:
num_train = int(0.8*len(X))
X_train = X[:num_train]
y_train = y[:num_train]
X_valid = X[num_train:]
y_valid = y[num_train:]

In [33]:
# Random forest classifier
clf = RandomForestClassifier(100, class_weight="balanced", random_state=4)
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=4, verbose=0, warm_start=False)

In [34]:
print("Train f1=",f1_score(y_train,clf.predict(X_train)))
print('Validation f1=', f1_score(y_valid, clf.predict(X_valid)))
print('Train score=',clf.score(X_train, y_train))
print('Validation score=',clf.score(X_valid, y_valid))

Train f1= 1.0
Validation f1= 0.9692946058091286
Train score= 1.0
Validation score= 0.96845694799659


In [41]:
# Adaboost classifier
clf2 = AdaBoostClassifier(base_estimator=clf, n_estimators=50, random_state=4)
clf2.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=RandomForestClassifier(bootstrap=True,
                                                         class_weight='balanced',
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features='auto',
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         n_estimators=100,
                  

In [42]:
print("train f1=",f1_score(y_train,clf2.predict(X_train)))
print('Validation f1=', f1_score(y_valid, clf2.predict(X_valid)))
print('Train score=',clf2.score(X_train, y_train))
print('Validation score=',clf2.score(X_valid, y_valid))

train f1= 1.0
Validation f1= 0.9684908789386402
Train score= 1.0
Validation score= 0.9676044330775788


In [37]:
# Preprocessing the test data
test = pd.read_csv('prml20jslot\\test.csv')
test = process_df(test)
X_test = np.array(test.drop(['lastratingdate'], axis=1))

In [43]:
y_pred = clf2.predict(X_test)

In [44]:
test['left'] = y_pred

In [45]:
test[['id', 'left']].to_csv('DC_26.csv', index=False)

In [None]:
remarks_supp_opp