# Exploratory Data Extraction

## Dependencies

In [7]:
!pip install numpy pandas matplotlib nltk  seaborn scikit-learn --quiet

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk

from tqdm import trange
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import warnings

# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Download the 'omw-1.4' NLTK resource without console output.
# NOTE(review): the imports also pull in `stopwords` and `WordNetLemmatizer`;
# confirm whether their corpora are downloaded elsewhere.
nltk.download('omw-1.4', quiet=True)

# Plot defaults: seaborn dark-grid style, then figure size and font size.
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': (17,7),
    'font.size': 18,
})

## Macros

In [5]:
# Column of interest: the one inspected and filtered on in the cells below.
roi_column = "Content"

# Columns selected from the loaded CSV, in this order.
req_columns = [
    'ID',
    'URL',
    'Source',
    'Type',
    'CreatedAt',
    'Language',
    'Record Sentiment',
    'Tracked Keywords',
    'Reasons',
    'Content',
]

## Helper Class and Functions

In [3]:
class PATH:
    """Namespace holding file locations used by this notebook."""

    # CSV file read by the data-loading cell.
    data_path = "./enterpret.csv"

## Loading Data

In [6]:
# Read the CSV at PATH.data_path and keep only the columns listed in
# req_columns (in that order), then preview the first rows.
data = pd.read_csv(PATH.data_path).loc[:, req_columns]
data.head()

Unnamed: 0,ID,URL,Source,Type,CreatedAt,Language,Record Sentiment,Tracked Keywords,Reasons,Content
0,e0e5cedb-2da1-5476-bfef-366eebc71656,https://dashboard.enterpret.com/enterpretinc/r...,Slack,RecordTypeConversation,2024-01-12T22:28:45Z,eng,NEGATIVE,,Issue With Decreased Feedback Volume From Sources,Agent: Mike McNasby jil It looks like the volu...
1,a879cfc1-120c-5a69-b059-5820f08abae3,https://dashboard.enterpret.com/enterpretinc/r...,Gong,RecordTypeAudioRecording,2024-01-12T21:39:45Z,eng,,"Feedback, Integration, Quantify, Reporting, Sl...","Ability To Filter Information In Zendesk, Help...",<AUDIO_CONTENT>
2,69949ac0-6280-5a97-be88-315985fe2cbc,https://dashboard.enterpret.com/enterpretinc/r...,Slack,RecordTypeConversation,2024-01-12T20:57:40Z,eng,NEUTRAL,Schedule,,"User: Adding Daniela González, Emma's ExecOps ..."
3,7396e171-235d-5cd4-af82-7b8d4c70ce76,https://dashboard.enterpret.com/enterpretinc/r...,Salesforce - Opportunity,RecordTypeSurvey,2024-01-12T20:57:28Z,eng,NEGATIVE,Feedback,"Issue With Low Volume, Issue With Feedback Goi...","Closed Lost Notes\nNot enough volume, not enou..."
4,6794583a-f498-5904-925e-08b826581709,https://dashboard.enterpret.com/enterpretinc/r...,Gong,RecordTypeAudioRecording,2024-01-12T20:01:16Z,eng,,"Product Design, Feedback, Enterpret, Survey, D...","Ability To Get More Detailed Feedback, Ability...",<AUDIO_CONTENT>


In [7]:
# Show (rows, columns) and the column labels of the loaded frame as a tuple.
data.shape, data.columns

((904, 10),
 Index(['ID', 'URL', 'Source', 'Type', 'CreatedAt', 'Language',
        'Record Sentiment', 'Tracked Keywords', 'Reasons', 'Content'],
       dtype='object'))

In [8]:
# Three most frequent values of the column of interest; NaN is counted as
# its own value because dropna=False.
data[roi_column].value_counts(dropna=False).head(3)

Content
<AUDIO_CONTENT>                                    507
NaN                                                 44
Agent: Saurabh Arora has joined the channel\n\n      3
Name: count, dtype: int64

### Observation - 1
507 audio-content rows + 44 null-content rows (551 of 904) cannot be processed, because their content is either the `<AUDIO_CONTENT>` placeholder or NaN.

In [9]:
# "Tracked Keywords" for the rows whose content is the "<AUDIO_CONTENT>"
# placeholder, selected with a single .loc lookup.
data.loc[data[roi_column] == "<AUDIO_CONTENT>", "Tracked Keywords"]

1      Feedback, Integration, Quantify, Reporting, Sl...
4      Product Design, Feedback, Enterpret, Survey, D...
5      Reporting, Search, Reason, Design, Summary, Ch...
6      Enterpret, Zendesk, Data, Feedback, Chat, Emai...
7      Enterpret, Customer, Feedback, Channel, Slack,...
                             ...                        
895                                                  NaN
896    Enterpret, Customer, Feedback, Taxonomy, Sales...
898    Tickets, Sentiment, Product Design, Feedback, ...
901    Feedback, Reason, Enterpret, Product Design, T...
902    Onboarding, Tickets, Product Design, Enterpret...
Name: Tracked Keywords, Length: 507, dtype: object

### Observation - 2 

## Preprocessing

### Dropping null and audio content ones

In [10]:
# Keep only rows whose content is neither the "<AUDIO_CONTENT>" placeholder
# nor missing.
filtered_data = data[
    data[roi_column].ne("<AUDIO_CONTENT>") & data[roi_column].notna()
]

In [11]:
# (rows, columns) remaining after dropping placeholder and missing content.
filtered_data.shape

(353, 10)

In [12]:
# Column labels of the filtered frame (row filtering leaves them unchanged).
filtered_data.columns

Index(['ID', 'URL', 'Source', 'Type', 'CreatedAt', 'Language',
       'Record Sentiment', 'Tracked Keywords', 'Reasons', 'Content'],
      dtype='object')

In [13]:
# Write the filtered rows to raw_data.csv without the index column.
# NOTE(review): the file is named "raw" although it holds the filtered frame.
filtered_data.to_csv("raw_data.csv", index=False)

In [14]:
# Load filtered_output.csv and rebind `data` to it, replacing the frame read
# from PATH.data_path earlier.
# NOTE(review): no visible cell writes filtered_output.csv (the cell above
# writes raw_data.csv) — presumably produced by a step outside this
# notebook; confirm and document where it comes from.
data = pd.read_csv("./filtered_output.csv")

In [18]:
# Column labels of the frame loaded from filtered_output.csv.
data.columns

Index(['ID', 'Conversation', 'Metadata'], dtype='object')