In [1]:
# general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib import cm
%matplotlib inline
import seaborn as sns
import itertools

from collections import Counter
# text processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Regular Expression
import re
import string
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, sent_tokenize

#Quality of image:
%config InlineBackend.figure_format = 'svg'

from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer

In [2]:
def miss_value_func(in_data, classificatio_flag=0, only_miss_col=0):
    """
    Summarise missing values per column, optionally split by target level.

    Parameters
    ----------
    in_data : pandas.DataFrame
        Frame to inspect. When ``classificatio_flag == 1`` it must contain a
        column named 'target'.
    classificatio_flag : int, default 0
        0 -> return only the overall summary ('miss_value', 'percentage').
        1 -> additionally add, for every unique value of 'target', the columns
             'count_miss: target=<level>'   (missing cells among rows of that level),
             'percent_miss: target=<level>' (share of the column's missing cells
                                             that fall in that level, in %),
             'norm_miss: target=<level>'    (missing cells divided by the number
                                             of rows of that level, in %).
    only_miss_col : int, default 0
        1 -> keep only the rows describing columns with at least one missing value.

    Returns
    -------
    pandas.DataFrame or None
        Table indexed by the column names of ``in_data``, sorted by number of
        missing values (descending). ``None`` is returned when
        ``classificatio_flag == 1`` but there is no 'target' column (an error
        message is printed), or when ``classificatio_flag`` is neither 0 nor 1.
    """
    out_table = pd.DataFrame({'miss_value': in_data.isnull().sum()})
    out_table['percentage'] = (100 * out_table['miss_value']) / len(in_data)
    out_table = out_table.sort_values(by='miss_value', ascending=False)

    # After the descending sort, the columns with missing data are the first rows.
    n_miss_cols = int((out_table['miss_value'] > 0).sum())
    print('The input data frame has {:d} features.'.format(in_data.shape[1]))
    print('The data frame has {:d} columns with missing values'.format(n_miss_cols))

    if only_miss_col == 1:
        # Copy so that adding columns below never writes into a slice view.
        out_table = out_table.iloc[:n_miss_cols, :].copy()

    if classificatio_flag == 0:
        return out_table
    if classificatio_flag != 1:
        return None

    if 'target' not in in_data.columns:
        print("Error!  The dataframe does not have column with name 'target'.  Try again...")
        return None

    target = in_data['target']
    uniq_levels = list(target.unique())
    miss_cols = list(out_table.index[:n_miss_cols])
    is_missing = in_data[miss_cols].isnull()
    total_missing = out_table['miss_value']

    # One (level, per-column missing count, number of rows in level) entry per level.
    level_stats = []
    for lev in uniq_levels:
        level_mask = target == lev
        counts = (
            is_missing.loc[level_mask]
            .sum()
            .reindex(out_table.index, fill_value=0)
            .astype(int)
        )
        level_stats.append((lev, counts, int(level_mask.sum())))

    # Column groups are appended in the same order as before: counts, percents, norms.
    for lev, counts, _ in level_stats:
        out_table['count_miss: target=' + str(lev)] = counts

    for lev, counts, _ in level_stats:
        # 0/0 (columns without missing data) yields NaN -> reported as 0.
        out_table['percent_miss: target=' + str(lev)] = (
            (100.0 * counts / total_missing).fillna(0.0)
        )

    for lev, counts, n_level in level_stats:
        # A level with no matching rows (e.g. NaN target) is reported as 0.
        if n_level:
            out_table['norm_miss: target=' + str(lev)] = 100.0 * counts / n_level
        else:
            out_table['norm_miss: target=' + str(lev)] = 0.0

    return out_table
        
def make_count_plot(df, cols, annotate=False, xRotation = 0, y_min=None, y_max=None, title=None):
    """
    Draw one seaborn count plot per column in ``cols``, stacked vertically.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cols : sequence of str
        Column names to plot; nothing is drawn when it is empty.
    annotate : bool, default False
        When truthy, write each bar's share of ``len(df)`` (in %) above the bar.
    xRotation : int, default 0
        Any non-zero value rotates the x tick labels by 90 degrees
        (the value itself is only used as an on/off switch).
    y_min, y_max : float, optional
        Lower / upper y-axis limit. Either one may be given on its own.
    title : str, optional
        Axes title. Only applied when a single column is plotted.
    """
    totalcnt = df.shape[0]
    cnt = len(cols)
    if cnt == 0:
        return

    # squeeze=False always yields a 2-D array, so one loop serves 1..n columns.
    fig, axes = plt.subplots(cnt, 1, figsize=(5, 3 * cnt), squeeze=False)

    for ax, col in zip(axes[:, 0], cols):
        sns.countplot(x=df[col], ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        if cnt == 1:
            ax.set_title(title)

        # Limits are applied to the axes being drawn (the multi-column branch
        # previously referenced an undefined name here).
        if y_min is not None or y_max is not None:
            ax.set(ylim=(y_min, y_max))

        if xRotation != 0:
            ax.tick_params(axis='x', labelrotation=90)

        if annotate:
            for bar in ax.patches:
                corners = bar.get_bbox().get_points()
                bar_top = corners[1, 1]
                ax.annotate(
                    '{:.1f}%'.format(bar_top * 100 / totalcnt),
                    (corners[:, 0].mean(), bar_top),
                    ha='center',
                    va='bottom',
                )

    if cnt > 1:
        fig.tight_layout()

def plotResult(result_df, yLabel = 'Frequency', yRange=None, title=None):
    """
    Plot the columns of ``result_df`` as grouped vertical bars in a new figure.

    Parameters
    ----------
    result_df : pandas.DataFrame
        One bar group per index entry, one bar per column.
    yLabel : str, default 'Frequency'
        Label of the y axis.
    yRange : tuple, optional
        Passed to pandas as ``ylim``.
    title : str, optional
        Title of the plot; omitted when ``None``.
    """
    n_groups = len(result_df.index)
    plt.set_cmap('Set1')

    bar_ax = result_df.plot(kind='bar', ylim=yRange, figsize=(6,4), align='center')

    # One tick per index entry, labelled with the index value.
    bar_ax.set_xticks(np.arange(n_groups))
    bar_ax.set_xticklabels(result_df.index)
    bar_ax.set_ylabel(yLabel)
    if title is not None:
        bar_ax.set_title(title)

    # Legend sits outside the axes, anchored to the upper-right corner.
    bar_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    
    
def makeHist(df, col, title=None, x_range=None, bins='auto', x_tickLst = None):
    """
    Draw side-by-side probability histograms of ``col`` for 'ham' and 'spam'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and a 'label' column whose values include
        'ham' and 'spam'.
    col : str
        Column to histogram.
    title : str, optional
        Figure-level title.
    x_range : (low, high), optional
        Rows with ``col`` outside the inclusive range are dropped and the
        x-axis limits are set to this range.
    bins : default 'auto'
        Forwarded to ``sns.histplot``.
    x_tickLst : sequence, optional
        Explicit x tick positions (lists and NumPy arrays are both accepted).
    """
    fig, axes = plt.subplots(1, 2, figsize=(6, 3))
    panel_colors = {'ham': 'cornflowerblue', 'spam': 'orangered'}

    # "is not None" keeps array-like arguments from raising an ambiguous
    # truth-value error.
    if x_range is not None:
        df = df.loc[(df[col] >= x_range[0]) & (df[col] <= x_range[1])]

    for (label, color), ax in zip(panel_colors.items(), axes.flatten()):
        sns.histplot(
            df.loc[df['label'] == label, col],
            color=color,
            kde=False,
            cbar_kws=dict(edgecolor="k"),
            ax=ax,
            stat='probability',
            bins=bins,
        )
        if x_range is not None:
            ax.set(xlim=x_range)

        ax.set_xlabel(label, fontweight="bold")
        ax.set_ylabel('Frequency', fontweight="bold")
        if x_tickLst is not None:
            ax.set_xticks(x_tickLst)
        ax.yaxis.grid(False)

    if title is not None:
        fig.suptitle(title, fontsize=12, fontweight="bold")

    fig.tight_layout()
    
def makeHistOneFigure(df, col, x_label = 'x_label', y_label = 'Probability', title=None, x_range=None, bins=None):
    """
    Overlay density-normalised histograms of ``col`` for 'ham' and 'spam' on one axes.

    ``df`` must contain ``col`` and a 'label' column. 'ham' is drawn first in
    cornflowerblue; 'spam' is drawn on top in orangered with alpha 0.8.
    ``x_range`` and ``bins`` are forwarded to ``Axes.hist``.
    """
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))

    # (label value, colour, extra keyword arguments) in drawing order.
    series_specs = (
        ('ham', 'cornflowerblue', {}),
        ('spam', 'orangered', {'alpha': 0.8}),
    )
    for group, shade, extra in series_specs:
        values = df.loc[df['label'] == group, col]
        ax.hist(values, range=x_range, density=True, bins=bins, label=group, color=shade, **extra)

    ax.set_title(title, fontweight="bold")
    ax.set_xlabel(x_label, fontweight="bold")
    ax.set_ylabel(y_label, fontweight="bold")
    ax.xaxis.grid(False)
    ax.legend()
    fig.tight_layout()
    
def make_box_plot(df, col,  refCol, x_range= None, y_label=None, title=None,xRotation=None):
    """
    Draw a box plot of ``refCol`` (y axis) grouped by ``col`` (x axis).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing ``col`` and ``refCol``.
    col : str
        Grouping column shown on the x axis.
    refCol : str
        Numeric column whose distribution is shown on the y axis.
    x_range : (low, high), optional
        Despite its name, this filters rows on ``refCol``: only rows with
        ``low <= refCol <= high`` are plotted.
    y_label, title : str, optional
        Axis label / axes title; left untouched when ``None``.
    xRotation : number, optional
        Rotation of the x tick labels in degrees.
    """
    if x_range is not None:
        df = df.loc[(df[refCol] >= x_range[0]) & (df[refCol] <= x_range[1])]

    fig, ax = plt.subplots(1, 1, figsize=(6,3))
    sns.boxplot(x=df[col], y=df[refCol], ax=ax)

    # Depending on the matplotlib/seaborn version the box artists live in
    # ax.artists (older) or ax.patches (newer). zip() recolours only the boxes
    # that exist, so a single-category plot no longer raises IndexError.
    boxes = list(ax.artists) or list(ax.patches)
    for box, face in zip(boxes, ('cornflowerblue', 'coral')):
        box.set_facecolor(face)

    if y_label is not None:
        ax.set_ylabel(y_label, fontweight="bold")
    if title is not None:
        ax.set_title(title, fontweight="bold")
    ax.set_xlabel('')
    ax.xaxis.grid(False)

    if xRotation is not None:
        ax.tick_params(axis='x', labelrotation=xRotation)
    fig.tight_layout()
    
def make_multiple_bar_plot(data1, data2, sort_index = True, max_index= None, y_label=None, figTitle=None,xRotation=None, annotation=True, xtick_label=None):
    """
    Plot two value-count series as side-by-side bar charts ('ham' left, 'spam' right).

    Parameters
    ----------
    data1, data2 : pandas.Series
        Series made by ``value_counts()``. Values are multiplied by 100 and the
        y axis is labelled 'Frequency (%)', so they are presumably normalised
        fractions -- confirm against the caller. The inputs are not modified.
    sort_index : bool, default True
        Sort both series by index before plotting.
    max_index : optional
        When given, keep only entries whose index is ``<= max_index``.
    y_label : optional
        Accepted for interface compatibility; not used by the function.
    figTitle : str, optional
        Figure-level title.
    xRotation : number, optional
        Rotation of the x tick labels in degrees.
    annotation : bool, default True
        Write each bar's height (formatted as a percentage) above the bar.
    xtick_label : sequence, optional
        Replacement x tick labels applied to both panels.
    """
    if max_index is not None:
        data1 = data1.loc[data1.index <= max_index]
        data2 = data2.loc[data2.index <= max_index]

    if sort_index:
        data1 = data1.sort_index()
        data2 = data2.sort_index()

    # Build new series instead of "*=" so the caller's objects are never
    # scaled in place (that happened when neither filter nor sort made a copy).
    data1 = data1 * 100
    data2 = data2 * 100

    fig, axes = plt.subplots(1, 2, figsize=(6, 3))
    panels = zip(
        axes.flatten(),
        (data1, data2),
        ('cornflowerblue', 'coral'),
        ('ham', 'spam'),
    )

    for ax, series, color, label in panels:
        series.plot(kind='bar', ax=ax, color=color)
        ax.set_xlabel(label, fontweight="bold")
        ax.set_ylabel('Frequency (%)', fontweight="bold")
        ax.yaxis.grid(False)

        if xtick_label is not None:
            ax.set_xticklabels(xtick_label)

        if xRotation is not None:
            ax.tick_params(axis='x', labelrotation=xRotation)

        if annotation:
            for bar in ax.patches:
                corners = bar.get_bbox().get_points()
                bar_top = corners[1, 1]
                ax.annotate(
                    '{:.1f}%'.format(bar_top),
                    (corners[:, 0].mean(), bar_top),
                    ha='center',
                    va='bottom',
                )

    if figTitle is not None:
        fig.suptitle(figTitle, fontsize=12, fontweight="bold")
    fig.tight_layout()
    
def plotComparison_df(result_df, xLabel = 'xLabel', yLabel = 'Frequency', xRotation = 0, yRange=None, title=None):
    """
    Plot the columns of ``result_df`` as grouped bars on a 6x3 inch figure.

    Parameters
    ----------
    result_df : pandas.DataFrame
        One bar group per index entry, one bar per column.
    xLabel, yLabel : str
        Axis labels (rendered bold).
    xRotation : number, default 0
        Rotation of the x tick labels in degrees; 0 leaves them unchanged.
    yRange : tuple, optional
        Passed to pandas as ``ylim``.
    title : str, optional
        Axes title; omitted when ``None``.
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    result_df.plot(kind='bar', ylim=yRange, figsize=(6,3), align='center', ax=ax)

    # One tick per row of the frame, labelled with its index value.
    tick_positions = np.arange(len(result_df.index))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(result_df.index)

    bold = {'fontweight': "bold"}
    ax.set_xlabel(xLabel, **bold)
    ax.set_ylabel(yLabel, **bold)
    ax.xaxis.grid(False)

    if title is not None:
        ax.set_title(title, **bold)
    if xRotation != 0:
        ax.tick_params(axis='x', labelrotation=xRotation)

    # Legend is placed outside the plotting area, to the right.
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    
    
def prepareWordCloud(text, lemmatizerFlag = 1):
    """
    Turn a raw text message into a normalised token list.

    input:
    text: a text message
    lemmatizerFlag: a flag used for type of text normalization. It is an integer {0, 1}.
                    If 0: stem of word will be used (PorterStemmer),
                    If 1: lemma of word will be used (WordNetLemmatizer),
                    any other value: tokens are returned without stemming/lemmatizing.
    output: a token list of the message after applying the following steps:
    1- Tokenizing the input text
    2- Removing punctuations and digits (only purely alphabetic tokens are kept)
    3- Transforming into lowercase
    4- Removing English stopwords
    5- Applying stem or lemma of word in order to get a more homogenized token list
    """
    # 1- Tokenizing the input text
    tokenLst = nltk.word_tokenize(text)

    # 2 & 3- Keeping alphabetic tokens only, then lowercasing them
    tokenLst = [word.lower() for word in tokenLst if word.isalpha()]

    # 4- Removing stopwords (a set gives constant-time membership tests)
    stopwordSet = set(stopwords.words('english'))
    tokenLst = [word for word in tokenLst if word not in stopwordSet]

    # 5- Applying lemma or stem of word
    if lemmatizerFlag == 0:
        porter = PorterStemmer()
        tokenLst = [porter.stem(word) for word in tokenLst]
    elif lemmatizerFlag == 1:
        lemmatizer = WordNetLemmatizer()
        tokenLst = [lemmatizer.lemmatize(word) for word in tokenLst]
    return tokenLst

def prepareWordCloud_step2(token_data, min_freq=20):
    """
    Drop infrequent words from every token list of a series.

    input:
    token_data: a series of tokenized messages (each element is a list of tokens)
    min_freq: minimum number of occurrences, counted over ALL messages together,
              a word needs in order to be kept (default 20)
    output: a series of token lists in which words occurring fewer than
            ``min_freq`` times in total have been removed; order is preserved.
    The process is as follows:
    1- Counting every token over all messages
    2- Keeping, per message, only the tokens whose total count reaches min_freq
    """
    # 1- Counting every token over all messages
    frequencies = Counter(itertools.chain.from_iterable(token_data))

    # 2- Filtering infrequent words with a direct count lookup per token,
    #    instead of searching a list of all infrequent occurrences.
    return token_data.apply(
        lambda row: [word for word in row if frequencies[word] >= min_freq]
    )

def _tokens_to_text(tokens):
    """Join tokens (flat, or nested one level such as a series of token lists) with spaces."""
    if isinstance(tokens, str):
        return tokens
    try:
        items = iter(tokens)
    except TypeError:
        # Not iterable: fall back to its plain string form.
        return str(tokens)

    words = []
    for item in items:
        if isinstance(item, str):
            words.append(item)
            continue
        try:
            words.extend(str(tok) for tok in item)
        except TypeError:
            words.append(str(item))
    return ' '.join(words)


def wordCloudFunc(inputData, isToken = 0, max_words=100, width=1600, height=800, figsize=(15,10), bg_color='k'):
    """
    input:
    inputData: data for making word cloud
    isToken: a flag {0, 1} shows if input is in form of token or not: 0 used for text input, 1 used for token input.
             Token input may be a list of tokens or a collection (list / series) of token lists;
             it is joined with single spaces before being passed to WordCloud, so that no
             brackets, quotes or truncated reprs end up in the cloud text.
    max_words: maximum words for word cloud, default=100,
    width= width of word cloud, default=1600,
    height= height of word cloud, default=800,
    figsize=size of return figure, default=(15,10),
    bg_color=background color of word cloud, default='k'
    output:
    word cloud figure (drawn on a new matplotlib figure; nothing is returned)
    """
    if isToken == 1:
        inputData = _tokens_to_text(inputData)

    wordcloud = WordCloud(max_words=max_words, width=width, height=height, background_color=bg_color).generate(inputData)
    plt.figure(figsize=figsize)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    
    
def make_bar_plot(X, y, title='Title', xlbl='X_Label', ylbl='Y_Label', xRotation=90, annotation=False):
    """
    Draw a seaborn bar plot of ``y`` against ``X`` on a 6x3 inch figure.

    Parameters
    ----------
    X, y : array-like
        Categories (x axis) and bar heights (y axis), forwarded to ``sns.barplot``.
    title, xlbl, ylbl : str
        Axes title and axis labels (rendered bold).
    xRotation : number, default 90
        Rotation of the x tick labels in degrees; 0 leaves them unchanged.
    annotation : bool, default False
        When truthy, write each bar's height, formatted as a percentage, above it.

    Bars are coloured cornflowerblue / orangered when there are exactly two,
    otherwise with evenly spaced samples of the 'brg' colormap.
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    # x and y must be passed by keyword: recent seaborn releases reject
    # positional data vectors.
    sns.barplot(x=X, y=y, ax=ax)

    bars = ax.patches
    n = len(bars)
    if n == 2:
        bars[0].set_facecolor('cornflowerblue')
        bars[1].set_facecolor('orangered')
    else:
        for i in range(n):
            shade = (n - 1) / n - i / n
            bars[i].set_facecolor(plt.cm.brg(shade))

    if annotation:
        for bar in ax.patches:
            corners = bar.get_bbox().get_points()
            bar_top = corners[1, 1]
            ax.annotate(
                '{:.1f}%'.format(bar_top),
                (corners[:, 0].mean(), bar_top),
                ha='center',
                va='bottom',
            )

    if xRotation != 0:
        ax.tick_params(axis='x', labelrotation=xRotation)

    ax.set_title(title, fontweight="bold")
    ax.set_xlabel(xlbl, fontweight="bold")
    ax.set_ylabel(ylbl, fontweight="bold")