In [1]:
!pip install --upgrade "ibm-watson>=4.5.0"

Requirement already up-to-date: ibm-watson>=4.5.0 in /anaconda3/lib/python3.6/site-packages (4.5.0)


In [2]:
import csv
import os
import time

import numpy as np
import pandas as pd
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ToneAnalyzerV3

In [3]:
# Column names imposed on the survey export, in file order. The file's own
# first row is skipped when reading, so these replace it.
headers = [
    "timestamp", "level", "country", "is_online",
    "first_survey", "agree", "primary_mode", "preferred_mode",
    "why_mode", "platforms_used", "methods_used", "techniques_outside",
    "remote_enjoy", "remote_motivated", "remote_satisfied",
    "remote_engaging", "remote_distracted", "remote_questions",
    "remote_changes",
    "prior_enjoy", "prior_motivated", "prior_satisfied",
    "prior_engaging", "prior_distracted", "prior_questions",
    "prior_changes",
    "preference", "why_preference",
]

# The six Likert topics; each is asked once with a "remote_" and once with a
# "prior_" prefix. The "change_" names are only defined here, not used below.
likert_topics = ["enjoy", "motivated", "satisfied",
                 "engaging", "distracted", "questions"]
remote_survey_q = ["remote_" + topic for topic in likert_topics]
change_survey_q = ["change_" + topic for topic in likert_topics]

# Agreement labels -> ordinal scores 1..5, shared by every Likert column.
likert_dict = {
    "Strongly Disagree": 1,
    "Disagree": 2,
    "Neutral": 3,
    "Agree": 4,
    "Strongly Agree": 5,
}
likert_code = {
    column: likert_dict
    for column in remote_survey_q + ["prior_" + topic for topic in likert_topics]
}

# Long answer labels -> short codes for the two delivery-mode questions.
# NOTE(review): the labels differ slightly between the two questions
# ("Google Meet" vs "google meet", "Recorded Lectures" vs
# "Recorded Lectures/Videos") — confirm they match the raw survey options.
mode_codes_primary = {
    "Live classes (ie: Zoom, Google Meet etc.)": "live",
    "Uploaded or emailed Materials": "upload",
    "Recorded Lectures": "recorded",
    "Discussion forums/chats": "chat",
}
mode_codes_preferred = {
    "Live classes (ie: Zoom, google meet etc.)": "live",
    "Uploaded or emailed Materials": "upload",
    "Recorded Lectures/Videos": "recorded",
    "Discussion forums/chats": "chat",
}
modes = {"primary_mode": mode_codes_primary,
         "preferred_mode": mode_codes_preferred}

# Read the raw data; "?" cells are treated as missing values.
full_data = pd.read_csv("raw_data.csv", skiprows=[0], names=headers, na_values="?")

# Working frame: drop three columns, then recode Likert and mode answers.
data = (
    full_data
    .drop(columns=["timestamp", "first_survey", "agree"])
    .replace(likert_code)
    .replace(modes)
)


In [4]:
# (rows, columns) of the recoded survey frame, before duplicate rows are removed.
data.shape

(10561, 25)

In [5]:
# Rows that exactly repeat an earlier row (pandas keeps the first occurrence
# unmarked), split by whether `primary_mode` was answered.
duplicate = data[data.duplicated()]
has_primary_mode = duplicate['primary_mode'].notna()
valid_dup_index = duplicate[has_primary_mode].index
nan_dup_index = duplicate[~has_primary_mode].index

print('# of duplicates: ', len(duplicate))
print('Valid entries: ', valid_dup_index)
print('# of duplicates for valid entries: ', len(valid_dup_index))
print('NaN entries: ', nan_dup_index)
print('# of duplicates for NaN entries: ', len(nan_dup_index))

# Remove every flagged row by index label, then show the resulting shape.
data = data.drop(index=duplicate.index)
data.shape

# of duplicates:  106
Valid entries:  Int64Index([ 1143,  2304,  2966,  2968,  3441,  4090,  4550,  4778,  5406,
             5710,  6885,  7130,  7140,  7191,  7536,  7537,  7767,  8411,
             9016,  9314,  9371, 10154, 10186, 10447, 10472, 10531],
           dtype='int64')
# of duplicates for valid entries:  26
NaN entries:  Int64Index([   20,    56,   124,   208,   322,   416,   431,   587,   773,
              995,  1018,  1027,  1754,  1841,  2083,  2439,  2466,  2754,
             2892,  3077,  3403,  3552,  3728,  3876,  4093,  4161,  4232,
             4386,  4576,  4648,  4735,  4933,  5014,  5048,  5196,  5209,
             5226,  5346,  5583,  5658,  5719,  5908,  5940,  5992,  6067,
             6155,  6226,  6383,  6416,  6571,  6576,  6672,  6689,  7496,
             7607,  7721,  7860,  7986,  8037,  8080,  8145,  8260,  8344,
             8461,  8529,  9037,  9063,  9099,  9127,  9259,  9291,  9479,
             9789,  9800,  9848,  9972, 10064, 10151, 10446, 104

(10455, 25)

In [6]:
# Keep the four answer columns that are sent to the tone analyzer further down,
# together with `preferred_mode` and `preference`.
qualitative_columns = ['preferred_mode', 'why_mode', 'preference',
                       'why_preference', 'remote_changes', 'prior_changes']
qualitative_data = data.loc[:, qualitative_columns]

# Add one zero-filled float column per analysed answer column.
for tone_column in ('mode_tone', 'preference_tone',
                    'remote_changes_tone', 'prior_changes_tone'):
    qualitative_data[tone_column] = np.zeros(data.shape[0])

In [7]:
# Build the Watson Tone Analyzer client.
#
# The IAM API key is read from the environment so that no secret is stored in
# the notebook. The key that used to be written here in plain text should be
# treated as compromised and rotated.
# NOTE(review): the variable names mirror the <SERVICE>_APIKEY / <SERVICE>_URL
# convention of the IBM SDK — adjust if your deployment uses other names.
tone_api_key = os.environ.get('TONE_ANALYZER_APIKEY')
if not tone_api_key:
    raise RuntimeError(
        'Set the TONE_ANALYZER_APIKEY environment variable to your '
        'IBM Cloud IAM API key before running this cell.'
    )

authenticator = IAMAuthenticator(tone_api_key)
tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    authenticator=authenticator
)

# The endpoint can be overridden per environment; the default is the
# us-south endpoint this notebook has always used.
tone_analyzer.set_service_url(
    os.environ.get(
        'TONE_ANALYZER_URL',
        'https://api.us-south.tone-analyzer.watson.cloud.ibm.com',
    )
)

In [8]:
def sentiment(data):
    """Return the document-level tones detected in a single answer.

    Parameters
    ----------
    data : str or missing value
        Text of one response. Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    list of dict or float
        The ``['document_tone']['tones']`` list from the service response
        (entries look like ``{'score': ..., 'tone_id': ..., 'tone_name': ...}``),
        or ``np.nan`` when ``data`` is missing.

    Notes
    -----
    Uses the module-level ``tone_analyzer`` client and sends one request per
    non-missing input. Errors raised by the client are not caught here.
    """
    # Missing answers are passed through without spending an API call.
    if pd.isna(data):
        # `np.nan`, not the `np.NaN` alias: the alias was removed in NumPy 2.0.
        return np.nan

    response = tone_analyzer.tone(data, content_type='text/plain')
    return response.result['document_tone']['tones']

In [9]:
def sentiment_applied(data, col):
    """Run ``sentiment`` over one column and report how long it took.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the answers.
    col : str
        Name of the column whose values are passed to ``sentiment`` one by one.

    Returns
    -------
    tuple
        ``(elapsed_seconds, sentiments)``: the elapsed time as a float and a
        Series, indexed like ``data[col]``, of whatever ``sentiment`` returned
        for each value.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments during a long run.
    started = time.perf_counter()
    sentiments = data[col].apply(sentiment)
    elapsed = time.perf_counter() - started
    return elapsed, sentiments

In [10]:
# Sanity check: the selected answer columns plus the four added *_tone columns.
print('Data shape: ', qualitative_data.shape)

Data shape:  (10455, 10)


In [13]:
# Peek at the first few `why_preference` answers.
qualitative_data['why_preference'].head()

0    Easier to ask for help, easier to understand t...
1    In person more feel for the subject and can re...
2                          Motivated to work in person
3                                              Friends
4                                    Easier to debate 
Name: why_preference, dtype: object

In [19]:
# Number of `why_preference` entries (one per remaining row, missing values included).
qualitative_data['why_preference'].shape

(10455,)

In [20]:
# Second-to-last answer by position, used as a sample input for the tone analyzer.
qualitative_data['why_preference'].iloc[-2]

"it's more engaging."

In [21]:
# Smoke test: send that single answer to the service and show the returned tones.
sentiment(qualitative_data['why_preference'].iloc[-2])

[{'score': 0.880435, 'tone_id': 'joy', 'tone_name': 'Joy'}]

In [16]:
# Same smoke test on the second answer by position.
sentiment(qualitative_data['why_preference'].iloc[1])

[{'score': 0.730335, 'tone_id': 'analytical', 'tone_name': 'Analytical'}]

In [31]:
# Score every `why_mode` answer (one service call per non-missing row).
# `duration1` is the elapsed time in seconds reported by `sentiment_applied`.
duration1, sentiment_why_mode = sentiment_applied(qualitative_data, 'why_mode')

In [35]:
# Persist the `why_mode` tones so the API calls do not have to be repeated.
sentiment_why_mode.to_csv('sentiment_why_mode.csv')

# The browser download helper only exists on Google Colab. Elsewhere the CSV
# written above simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('sentiment_why_mode.csv')

In [None]:
# Score every `why_preference` answer; `duration2` is the elapsed time in seconds.
# NOTE(review): unlike `sentiment_why_mode`, this result is not written to disk
# in the cells visible here.
duration2, sentiment_why_preference = sentiment_applied(qualitative_data, 'why_preference')

In [None]:
# Score every `remote_changes` answer; `duration3` is the elapsed time in seconds.
duration3, sentiment_remote_change = sentiment_applied(qualitative_data, 'remote_changes')

In [None]:
# Score every `prior_changes` answer; `duration4` is the elapsed time in seconds.
duration4, sentiment_prior_change = sentiment_applied(qualitative_data, 'prior_changes')