<a href="https://colab.research.google.com/github/ykim71/google_toxicity/blob/main/google_toxicity_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load File via GDrive



In [1]:
"""
Run this code and it will bring your Google account access permission. 
This gives Colab direct access for any files in your Google Drive.
OR, upload file from local
"""
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
"""
This code changes your working directory that Colab is on. I created and set 'toxicity' folder in my Google Drive. 
I can load and save data on the 'toxicity' folder in my Google Drive
"""
%cd drive/'MyDrive'/toxicity/

/content/drive/MyDrive/toxicity


## load your file to Colab

In [38]:
"""
load your file on Colab using following code. Replace 'sample_code_review.csv' with your file name. 
I set my data as 'sample_text' so you can replce it other name. 
"""

import pandas as pd

sample_text = pd.read_csv('sample_df.csv') # OR, /content/sample_code_review.csv


In [39]:
"""
take random 3 samples to see if data has loaded successfully; 'text' is the column that you want to analyze.
"""
#len(sample_text)
#sample_text = sample_text.sample(100)
sample_text.sample(3)

# Perpective API toxicity 



> Language Attributes: https://developers.perspectiveapi.com/s/about-the-api-attributes-and-languages



> API Request: https://developers.perspectiveapi.com/s/docs-get-started (note UT Google Account may not work; recommend using personal Google account for request)



In [5]:
"""
load packages/libraries
"""
from googleapiclient import discovery
from googleapiclient.errors import HttpError


In [6]:
"""
Enter your API here;
"""
API_KEY='AIzaSyAA12hq8oVxxjBzzyIUZ6GpOlNoMNxRNeA'


In [7]:
"""
Run this code if you want to analyze text data 4 measures of Toxicity, Likely to reject, Insult, and Identity Attact. 
See below comments for other variables and descriptions in detail.

"""
# variable descriptions: https://github.com/conversationai/perspectiveapi
# you can replace toxicity attributes here:
analyze_request = {
   'comment': { 'text': 'xx'}, # setting formats (id, text)
   'requestedAttributes': {'TOXICITY@6': {}, # see the actual variable name from the Perspective API page
                           'LIKELY_TO_REJECT@2': {}, 
                           'INSULT': {}, 
                           'IDENTITY_ATTACK': {} 
                           },
   'doNotStore': True, # for other settings, https://developers.perspectiveapi.com/s/about-the-api-methods
   'languages' : 'en'
}



In [None]:
# for a single text
import json

def incivility_measures(text):
  
  service = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,)
  
  analyze_request['comment']['text'] = text
  
  response = service.comments().analyze(body=analyze_request).execute()
  i = json.loads(json.dumps(response, indent=2))
  
  toxicity = i['attributeScores']['TOXICITY@6']['summaryScore']['value']
  reject = i['attributeScores']['LIKELY_TO_REJECT@2']['summaryScore']['value']
  insult = i['attributeScores']['INSULT']['summaryScore']['value']
  identity = i['attributeScores']['IDENTITY_ATTACK']['summaryScore']['value']
  
  print("text:" + text + "\ntoxicity:" + str(toxicity) + "\nreject:" + str(reject) + "\ninsult:" + str(insult) + "\nidentity:" + str(identity))


In [None]:

text = 'To all those thinking about voting for Trump, remember...  IN YOUR HEART... YOU KNOW HE\'S SHIT'
incivility_measures(text)

text = sample_text.text[1]
incivility_measures(text)


text:To all those thinking about voting for Trump, remember...  IN YOUR HEART... YOU KNOW HE'S SHIT
toxicity:0.94727516
reject:0.99898785
insult:0.71120167
identity:0.09659086
text:And with red tape slashed, scientists, medical researchers, our doctors are now rapidly developing not only treatments, but hopefully a vaccine. This unprecedented cooperation between the government, private industry, this will not forever change the way this country and the world will deal with future pandemics and crises, just like the travel ban.
toxicity:0.072587214
reject:0.06286466
insult:0.009431887
identity:0.0027008436


In [28]:
# run this code chunk
import csv
import codecs
import json
import time
import pandas as pd

def incivility_for_chunks(sample_text_file, text):
  
  start = time.time()

  service = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,)
  
  comments_toxicity_list = []
  comments_reject_list = []
  comments_insult_list = []
  comments_identity_list = []
  
  for i in text: 
    analyze_request['comment']['text'] = i
    
    try:
      response = service.comments().analyze(body=analyze_request).execute()
      i = json.loads(json.dumps(response, indent=2))
    
      comments_toxicity = i['attributeScores']['TOXICITY@6']['summaryScore']['value']
      comments_reject = i['attributeScores']['LIKELY_TO_REJECT@2']['summaryScore']['value']
      comments_insult = i['attributeScores']['INSULT']['summaryScore']['value']
      comments_identity = i['attributeScores']['IDENTITY_ATTACK']['summaryScore']['value']
        
    except HttpError:
      comments_toxicity = "error"
      comments_reject = "error"
      comments_insult = "error"
      comments_identity = "error"
      time.sleep(10.0) # added 10 second pause when error occurs
            
    comments_toxicity_list.append(comments_toxicity)
    comments_reject_list.append(comments_reject)
    comments_insult_list.append(comments_insult)
    comments_identity_list.append(comments_identity)
        
  temp = pd.DataFrame({'toxicity': comments_toxicity_list,
                       'reject': comments_reject_list, 
                       'insult': comments_insult_list, 
                       'attack': comments_identity_list})
  
  end=time.time()
  print("complete time: ", round(end -start, 2))

  return temp

In [40]:

"""
HERE is the code you need to chage. My data file name is 'sample_text' and the column name is 'text'. 
You can change your file and the text column name here.  
For other example, if your data name is 'df' and the text column name is 'comment', 
the first line of following code is supposed to be:

text = df.comment.values.tolist(): 

"""

text = sample_text.text.values.tolist()

"""
I switched the processs with a definition function for multiple running

add arguments into the definition below; (1) 'sample_text' is the datafile you have, 
'text' is the text data that you want to analyze here (we already assign text here)
For other example, if your data name is 'df' and the text column name is 'comment',

text = df.comment.values.tolist()

incivility_for_chunks(df, text)

"""
measures = incivility_for_chunks(sample_text, text)


complete time:  88.12


In [41]:
sample_text_merge = pd.concat([sample_text.reset_index(drop=True), measures], axis=1)


In [43]:
"""
see if measures have been computed successfully;
"""
sample_text_merge

Unnamed: 0,Unnamed: 0.1,text,toxicity,reject,insult,attack
0,264876,We all know by now that many of the CDCs early...,0.250108,0.297678,0.0632,0.003885
1,88283,And then in 1987 the first effective antiviral...,0.188476,0.144464,0.010211,0.002266
2,382241,Now the bad news. The bad news is the U.S. con...,0.192699,0.033217,0.034783,0.027012
3,247377,"So, you know, he shut down travel from China. ...",0.210472,0.335454,0.065123,0.016561
4,149172,Part of it like so much else with this pandemi...,0.067452,0.091916,0.007494,0.002063
...,...,...,...,...,...,...
95,257638,How many of your patients are you currently tr...,0.115194,0.319444,0.010591,0.003089
96,369835,"And, Doctor, other than the feeling of a sharp...",0.093193,0.650701,0.061452,0.010063
97,211257,"New developments today on the vaccine front, b...",error,error,error,error
98,409334,The United States has reached a sobering miles...,0.189502,0.192223,0.010135,0.005883


In [44]:
"""
check if there's error; errors can occur due to many reasons such as the API limit (in this case you may have to re-run those) 
or the Perpective API can't analyze the text because of different languages or so.

"""

sample_text_merge[sample_text_merge['toxicity']=="error"]

Unnamed: 0,Unnamed: 0.1,text,toxicity,reject,insult,attack
85,370617,If your conditions on the ground say your hosp...,error,error,error,error
87,151325,"I have to say the fact that Oxford, their monk...",error,error,error,error
89,230136,"Yes. I mean, this is -- thats a great point be...",error,error,error,error
90,426905,Governor Kevin Stitt of Ohio actually deployed...,error,error,error,error
91,290590,"And all you hear about is the red tape, the ho...",error,error,error,error
93,255542,Over 5 million immigrants in this country do p...,error,error,error,error
94,196333,"Yes. Im saying -- well, youre not going to get...",error,error,error,error
97,211257,"New developments today on the vaccine front, b...",error,error,error,error


In [45]:
"""
I'm going to re-run those errors and merge them to those completed;
(1) save complete cases separately
(2) select error cases and remove error columns (toxicity, etc)
"""
complete_cases = sample_text_merge[sample_text_merge['toxicity']!="error"]
error_cases = sample_text_merge[sample_text_merge['toxicity']=="error"]
error_cases.drop(['toxicity','reject', 'insult','attack'], axis=1, inplace=True)


text = error_cases.text.values.tolist()

error_cases_r1 = incivility_for_chunks(error_cases, text)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


complete time:  0.8


In [46]:
"""
check the output again; see if the error is due to the API limit or different language issues;
if there's error again, repeat the code above;

"""
error_cases_r1[error_cases_r1['toxicity']=='error']

Unnamed: 0,toxicity,reject,insult,attack


In [47]:
"""
merge all files

"""

sample_text_final = pd.concat([complete_cases, error_cases_r1])
sample_text_final.sort_index(inplace=True)


In [48]:
sample_text_final

Unnamed: 0,Unnamed: 0.1,text,toxicity,reject,insult,attack
0,264876.0,We all know by now that many of the CDCs early...,0.250108,0.297678,0.0632,0.003885
0,,,0.109022,0.175642,0.012149,0.002016
1,88283.0,And then in 1987 the first effective antiviral...,0.188476,0.144464,0.010211,0.002266
1,,,0.247909,0.201759,0.076503,0.011321
2,382241.0,Now the bad news. The bad news is the U.S. con...,0.192699,0.033217,0.034783,0.027012
...,...,...,...,...,...,...
92,495415.0,Californias first suspected community spread i...,0.261853,0.500069,0.013213,0.005513
95,257638.0,How many of your patients are you currently tr...,0.115194,0.319444,0.010591,0.003089
96,369835.0,"And, Doctor, other than the feeling of a sharp...",0.093193,0.650701,0.061452,0.010063
98,409334.0,The United States has reached a sobering miles...,0.189502,0.192223,0.010135,0.005883


In [49]:
"""
Exporting the results -- Save the data to your Google Drive OR the Colab environment. 
You can also find your data at the folder icon at the left side and download it.
"""

sample_text_final.to_csv('toxicity_done.csv')

In [None]:
# """
# For large text: 
# assign your column name (that contain text data you want to analyze) in the code

# Note that there's a limit to process depending on your API; added a 5 second pause when error occurs;
# """
# import csv
# import codecs
# import json
# import time
# import pandas as pd

# # setting attributes, can add more attiributes 

# service = discovery.build(
#   "commentanalyzer",
#   "v1alpha1",
#   developerKey=API_KEY,
#   discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
#   static_discovery=False,
# )

# start = time.time()

# comments_toxicity_list = []
# comments_reject_list = []
# comments_insult_list = []
# comments_identity_list = []

# """
# HERE is the code you need to chage. My data file name is 'sample_text' and the column name is 'text'. 
# You can change your file and the text column name here.  
# For other example, if your data name is 'df' and the text columne name is 'comment', 
# the first line of following code is supposed to be:

# for i in df.comment.values.tolist(): 

# Once you change this code, run this code block.
# """

# for i in sample_text.text.values.tolist(): 
#   analyze_request['comment']['text'] = i
  
#   try:
#     response = service.comments().analyze(body=analyze_request).execute()
#     i = json.loads(json.dumps(response, indent=2))
    
#     comments_toxicity = i['attributeScores']['TOXICITY@6']['summaryScore']['value']
#     comments_reject = i['attributeScores']['LIKELY_TO_REJECT@2']['summaryScore']['value']
#     comments_insult = i['attributeScores']['INSULT']['summaryScore']['value']
#     comments_identity = i['attributeScores']['IDENTITY_ATTACK']['summaryScore']['value']
        
#   except HttpError:
#     comments_toxicity = "error"
#     comments_reject = "error"
#     comments_insult = "error"
#     comments_identity = "error"
#     time.sleep(5.0) # added 5 second pause when error occurs
            
#   comments_toxicity_list.append(comments_toxicity)
#   comments_reject_list.append(comments_reject)
#   comments_insult_list.append(comments_insult)
#   comments_identity_list.append(comments_identity)
        
# sample_text = sample_text.join(pd.DataFrame({'toxicity': comments_toxicity_list, 
#                                              'reject': comments_reject_list, 
#                                              'insult': comments_insult_list, 
#                                              'attack': comments_identity_list}))

# end=time.time()
# print("complete time: ", round(end -start, 2))

complete time:  1114.99
