# Common Imports

In [3]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium import webdriver    
import time
import pandas as pd
import re
import collections
import nltk
from nltk import word_tokenize, Text, FreqDist
from nltk.corpus import wordnet as wn
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
wnl = nltk.WordNetLemmatizer()
import pathlib
from newspaper import Article
from urllib.parse import urlparse
# importing itertools for accumulate()
import itertools
# importing functools for reduce()
import functools
from tqdm.notebook import tqdm_notebook
import json
import praw
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import os
import concurrent.futures
import time
from tqdm import tqdm
import gc
from nltk.corpus import stopwords
from IPython.display import display, clear_output
import pickle

# Common Functions

In [None]:
from datetime import datetime

# Sample URLs for checking extract_domain by hand: the same article with and
# without the "www." prefix. Both should yield 'independent'.
tests_extract_domain = [
    'https://www.independent.co.uk/news/singapore-malaysia-kuala-lumpur-facebook-lawyers-b2017147.html',
    'https://independent.co.uk/news/singapore-malaysia-kuala-lumpur-facebook-lawyers-b2017147.html'
]


def extract_domain(url):
    """Return a short site name taken from the host part of *url*.

    The host is read with ``urlparse(url).netloc``; a leading ``www.`` is
    dropped and everything from the first remaining dot onwards is discarded,
    so ``https://www.independent.co.uk/...`` gives ``'independent'``.

    A result of ``'sg'`` is reported as ``'Yahoo'`` (hard-coded special case).

    Args:
        url: URL string. If ``urlparse`` finds no network location in it
            (for example, the scheme is missing), an empty string is returned.

    Returns:
        The first label of the host name, or ``'Yahoo'`` for ``'sg'``.
    """
    netloc = urlparse(url).netloc
    # Only strip "www." when it is the leading label; an unanchored match
    # would also eat it from the middle of a host such as "awww.example.com".
    host = re.sub(r'^www\.', '', netloc)
    # Keep only the text before the first dot (sub-domains, TLD and any port
    # that follows a dot are dropped).
    domain = host.split('.', 1)[0]
    if domain == 'sg':
        domain = 'Yahoo'
    return domain

def preprocess(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps, in order:
      1. remove spans enclosed in round or square brackets;
      2. pad ``. , ! ? ( )`` with spaces so words joined only by punctuation
         end up as separate tokens once punctuation is stripped;
      3. collapse runs of whitespace and lowercase the text;
      4. delete every character that is neither a word character nor
         whitespace;
      5. tokenise with ``word_tokenize`` and lemmatise each token that is not
         purely digits and not a stop word.

    NOTE(review): relies on the module-level names ``lemmatizer`` and
    ``stop_words``, which are not defined in the part of the file shown
    alongside this function -- confirm they are created before this is called.

    Args:
        text: input string.

    Returns:
        The surviving lemmatised tokens joined by single spaces, stripped.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # handling (non-raw "\(" / "\s" trigger invalid-escape warnings).
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    text = re.sub(r'([.,!?()])', r' \1 ', text)  # pad punctuation
    text = re.sub(r'\s{2,}', ' ', text)  # collapse whitespace runs
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if not token.isdigit() and token.lower() not in stop_words
    ]
    final_text = " ".join(lemmas)

    return final_text.strip()

def drop_na(df):
    """Treat empty strings as missing and drop every row with a missing value.

    Each cell equal to ``""`` is replaced with NaN, then rows containing any
    NaN are removed. When rows are removed, the number of removed rows is
    printed and a seaborn heatmap of the missing cells (before removal) is
    drawn on a new 15x5 figure. The caller's DataFrame is not modified.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new DataFrame with the incomplete rows removed.
    """
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use the
    # new name when the installed pandas has it to avoid the deprecation
    # warning. The check is on the class so a column named "map" cannot
    # interfere.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    df = elementwise(lambda x: np.nan if x == "" else x)

    cleaned = df.dropna()
    # Count rows actually removed. The largest per-column null count would
    # under-report when the nulls of different columns sit in different rows.
    na_num = len(df) - len(cleaned)
    if na_num == 0:
        print('\n-- No records contains null value, passed')
    else:
        print('\n-- {} records have been dropped due to null values'.format(na_num))
        plt.figure(figsize=(15,5))
        # Plot the frame that still contains the NaNs so the heatmap shows
        # where the missing cells were.
        sns.heatmap(df.isna().transpose(),
                    cmap="YlGnBu",
                    cbar_kws={'label': 'Missing Data'})
    return cleaned

# Rendering Function

In [None]:
def generate_wc(title, path, words):
    """Render a word cloud on a new 20x15 matplotlib figure.

    Args:
        title: text placed above the image (font size 15).
        path: currently unused; it was only referenced by a disabled
            ``savefig`` call.
        words: passed straight to ``WordCloud.generate_from_frequencies``
            (presumably a word -> frequency mapping; confirm against callers).

    Returns:
        None. The figure is left open on the current pyplot state.
    """
    cloud_options = {
        'background_color': 'white',
        'max_words': 100,
        'colormap': 'twilight',
        'max_font_size': 1000,
    }
    rendered = WordCloud(**cloud_options).generate_from_frequencies(words)

    plt.figure(figsize=(20, 15))
    plt.imshow(rendered, interpolation='bilinear')
    plt.title(title, fontsize=15)
    
def plot_bars(counts,words,title,path):
    """Draw and show a bar chart with the value written above each bar.

    Despite the parameter names, ``counts`` is used as the x positions/labels
    and ``words`` as the bar heights (they are passed to ``ax.bar`` in that
    order).

    Args:
        counts: x values (category labels) for the bars.
        words: bar heights; each is shown above its bar truncated to an int.
        title: chart title.
        path: currently unused; it was only referenced by a disabled
            ``savefig`` call.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    font_size = 15
    fig, ax = plt.subplots(figsize=(15, 5))
    bars = ax.bar(counts, words,color=(0.2, 0.4, 0.6, 0.6),align='center', width=0.8)

    ax.set_title(title, fontsize=font_size,pad=50)
    ax.title.set_color("Black")
    ax.set_ylabel('TF', fontsize=font_size)

    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    # Annotate each bar with its height, centred slightly above the bar top.
    for rect in bars:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
            int(height),
        ha='center', va='bottom',fontsize=round(font_size*0.8))

    ax.tick_params(axis='x', labelsize=font_size*1.2)
    ax.tick_params(axis='y', labelsize=font_size*1.2)

    plt.xticks(rotation=90)
    # No legend: the bars carry no label, so ax.legend() would only emit a
    # "no artists with labels" warning and add an empty legend.
    plt.show()
    # Resize after showing so a later save of `fig` would use a 15x15 canvas
    # without resizing the on-screen window (forward=False).
    fig.set_size_inches((15, 15), forward=False)
    
def plotFreqDistByGroup(df, name='domain', group_label=None, freq_label=None):
    """Plot, per group, how many non-null ``freq_label`` values there are.

    Groups ``df`` by ``group_label``, counts the ``freq_label`` column within
    each group, and hands the group keys and counts to ``plot_bars``.

    Args:
        df: DataFrame to summarise.
        name: free text used at the end of the chart title.
        group_label: column (or key) to group by.
        freq_label: column whose entries are counted per group.

    Returns:
        None.
    """
    per_group = df.groupby(group_label)[freq_label].count()
    x_labels = list(per_group.index)
    y_values = list(per_group)
    chart_title = 'Total no of {} by {} on {}'.format(freq_label, group_label, name)
    plot_bars(x_labels, y_values, chart_title, '')