In [1]:
def metafeatures_extraction(df,dfname):
    """Derive the e-mail meta-feature table from a frame of raw messages.

    Args:
        df: pandas DataFrame with one row per message. The columns read here
            are ``TIMESTAMP`` (objects exposing ``.hour``), ``TO`` (a sized
            iterable of receivers whose element ``[2]`` is ``'To'`` or
            ``'Cc'``), ``CONTENT`` (str), ``SUBJECT``, ``ATTACHMENTS``
            (numeric, it gets standardised) and ``RESPONSIVE``.
            The intermediate columns (``sending_hour``, ``daypart``, the
            receiver/word/stopword counts and ``has_subject``) are added to
            this frame in place, exactly as before.
        dfname: ``'enron'`` selects the English stopword list only; any other
            value uses the English and Dutch lists together.

    Returns:
        A DataFrame holding ``ATTACHMENTS``, ``RESPONSIVE``, the receiver,
        word and stopword counts, ``has_subject``, one indicator column per
        daypart, and every column whose name is shorter than three characters
        except ``'TO'`` (this is how the per-hour indicator columns
        ``'0'`` .. ``'23'`` are picked up). The count columns and
        ``ATTACHMENTS`` are standardised; ``number_of_stopwords_normalised``
        is left unscaled.

    Note:
        Requires the NLTK ``stopwords`` corpus (``nltk.download('stopwords')``).
    """
    # Function-scope imports keep the block self-contained; `pd` was previously
    # used here without being imported anywhere in the function.
    import nltk
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    dayparts = ('afternoon', 'middle of the night', 'morning', 'night')
    columns_to_keep=['ATTACHMENTS',
           'RESPONSIVE','number_of_receivers',
           'number_of_direct_receivers', 'number_of_cc_receivers',
           'number_of_stopwords', 'number_of_stopwords_normalised',
           'number_of_words', 'has_subject', 'afternoon', 'middle of the night',
           'morning', 'night']
    columns_to_scale=['ATTACHMENTS','number_of_receivers','number_of_direct_receivers','number_of_cc_receivers','number_of_stopwords','number_of_words']

    # All feature columns below are assigned as whole lists (positionally)
    # instead of row-by-row via df.loc[index, ...]: that is much faster and
    # stays correct when the frame's index contains duplicate labels.

    def extract_hour(df):
        """Add 'sending_hour': the hour component of each TIMESTAMP."""
        df['sending_hour']=[stamp.hour for stamp in df['TIMESTAMP']]
        return df

    def daypart_of(hour):
        """Map an hour to its daypart label ('' when no range matches, e.g. NaN)."""
        if 0 <= hour < 6:
            return 'middle of the night'
        if 6 <= hour < 12:
            return 'morning'
        if 12 <= hour < 18:
            return 'afternoon'
        if hour >= 18 or hour < 0:
            return 'night'
        return ''

    def extract_daypart(df):
        """Add 'daypart': the six-hour bucket the message was sent in."""
        df['daypart']=[daypart_of(hour) for hour in df['sending_hour']]
        return df

    def number_of_receivers(df):
        """Add 'number_of_receivers': the length of each TO entry."""
        df['number_of_receivers']=[len(receivers) for receivers in df['TO']]
        return df

    def number_of_direct_and_cc_receivers(df):
        """Add separate counts of receivers tagged 'To' and tagged 'Cc'."""
        direct_counts=[]
        cc_counts=[]
        for receivers in df['TO']:
            kinds=[receiver[2] for receiver in receivers]
            direct_counts.append(kinds.count('To'))
            cc_counts.append(kinds.count('Cc'))
        df['number_of_direct_receivers']=direct_counts
        df['number_of_cc_receivers']=cc_counts
        return df

    def number_of_words_and_stopwords(df, dfname):
        """Add word count, stopword count and stopword/word ratio per message."""
        # A set gives O(1) membership tests and counts each token at most once;
        # the previous list scan counted a token twice when the word occurred
        # in both the English and the Dutch list.
        stop_words=set(nltk.corpus.stopwords.words('english'))
        if dfname!='enron':
            stop_words.update(nltk.corpus.stopwords.words('dutch'))

        stopword_counts=[]
        stopword_ratios=[]
        word_counts=[]
        for content in df['CONTENT']:
            # Tokens are the non-empty pieces between single spaces and are
            # compared verbatim (case-sensitive, punctuation not stripped).
            words=[word for word in content.split(' ') if word]
            n_stopwords=sum(1 for word in words if word in stop_words)
            stopword_counts.append(n_stopwords)
            # A message without any token used to raise ZeroDivisionError.
            stopword_ratios.append(n_stopwords/len(words) if words else 0.0)
            word_counts.append(len(words))
        df['number_of_stopwords']=stopword_counts
        df['number_of_stopwords_normalised']=stopword_ratios
        df['number_of_words']=word_counts
        return df

    def has_subject(df):
        """Add 'has_subject': 0 when SUBJECT is the empty string, else 1."""
        df['has_subject']=df['SUBJECT'].ne('').astype('int64')
        return df

    def scale(df,columns_to_scale):
        """Standardise the given columns to zero mean and unit variance."""
        if len(df)==0:
            # StandardScaler rejects input with zero rows.
            return df
        scaler = StandardScaler()
        df[columns_to_scale]=scaler.fit_transform(df[columns_to_scale])
        return df

    def get_dummies(df,columns):
        """Replace each listed column by its one-hot indicator columns."""
        for column in columns:
            values=df[column]
            if pd.api.types.is_integer_dtype(values):
                # Stringify so the indicator columns get str names ('0'..'23'),
                # which process_to_table relies on.
                values=values.apply(str)
            df=pd.concat([df,pd.get_dummies(values)],axis=1)
            del df[column]
        return df

    def process_to_table(df,columns_to_keep):
        """Select the final feature columns."""
        # A daypart that never occurs in the data produces no dummy column;
        # add it as all zeros so the selection below cannot raise KeyError and
        # the output always carries the same daypart columns.
        for daypart in dayparts:
            if daypart not in df.columns:
                df[daypart]=0
        selected=list(columns_to_keep)
        # Short column names are the per-hour indicators; 'TO' is the raw
        # receiver column and must not leak into the feature table.
        selected.extend(column for column in df.columns
                        if len(column)<3 and column!='TO')
        return df[selected]

    df=extract_hour(df)
    df=extract_daypart(df)
    df=number_of_receivers(df)
    df=number_of_direct_and_cc_receivers(df)
    df=number_of_words_and_stopwords(df, dfname)
    df=has_subject(df)
    df=get_dummies(df,['sending_hour','daypart'])
    df=scale(df,columns_to_scale)
    df=process_to_table(df,columns_to_keep)

    return df
