# Stack Exchange QA Crawler

## API Permission Settings 
1. Log in to your Stack Exchange account first.

In [None]:
# Put your application information here.
# All three values are placeholders and must be filled in before the
# following cells are run.

client_id = ""  # client_id
client_secret = ""  # client_secret
key = ""  # key



In [None]:
from requests_oauthlib import OAuth2Session
from pprint import pprint

# Settings handed to the OAuth session.
scope = 'no_expiry'
redirect_uri = 'https://stackexchange.com/oauth/login_success'

oauth = OAuth2Session(
    client_id=client_id,
    scope=scope,
    redirect_uri=redirect_uri,
)

# Uncomment to inspect the session object while debugging:
# pprint(vars(oauth))

# authorization_url() returns the URL to visit together with the `state`
# value that was generated for this request.
authorization_url, state = oauth.authorization_url(
    'https://stackexchange.com/oauth/dialog'
)

print("Please click in this link:", authorization_url)

In [None]:
from urllib.parse import parse_qs, urlsplit

# copy the link from the browser and put in the 'response_url' below
print("copy the link from the browser and put in the 'response_url' below")
response_url = input().strip()

# Look the token up by name instead of by position: the previous
# `split('=')[1].split('&')[0]` silently returned the wrong value whenever
# `access_token` was not the first parameter, and left percent-encoded
# characters undecoded. The parameters may sit in the URL fragment (after '#')
# or in the query string (after '?'), so both are parsed; the fragment wins
# if the name appears in both.
_response_parts = urlsplit(response_url)
_response_fields = parse_qs(_response_parts.query)
_response_fields.update(parse_qs(_response_parts.fragment))

if 'access_token' not in _response_fields:
    # Do not echo the pasted URL here: it may contain other credentials.
    raise ValueError(
        "no 'access_token' parameter found in the pasted URL; "
        "copy the complete address of the page you were redirected to"
    )

access_token = _response_fields['access_token'][0]
access_token

## Initialize the Sentiment Analyzer

In [None]:
import requests
from azure.cognitiveservices.language.textanalytics import TextAnalyticsClient
from msrest.authentication import CognitiveServicesCredentials
from pprint import pprint
import os
import re


class SentimentAnalyzer:
    """Scores Stack Exchange answers with the Azure Text Analytics sentiment API.

    Answer bodies are HTML. Before being sent to Azure, ``<code>`` sections
    and all remaining markup are stripped and the text is cut to
    ``MAX_DOCUMENT_CHARS`` characters.
    """

    # Documents can have no more than 5120 characters; longer bodies are
    # truncated so they can still be scored.
    MAX_DOCUMENT_CHARS = 5120

    # DOTALL lets '.' match newlines. Without it, a code block or tag that
    # spans several lines is never matched and its content leaks into the
    # text that gets analyzed.
    _CODE_PATTERN = re.compile(r"<code>.*?</code>", re.DOTALL)
    _TAG_PATTERN = re.compile(r"<.*?>", re.DOTALL)

    def __init__(self):
        self.client = self.authenticateClient()

    # Creates client for sending requests to azure textanalytics api
    def authenticateClient(self):
        """Build and return the ``TextAnalyticsClient`` used for all requests."""
        # TODO: change endpoint and key

        endpoint = ''
        key = ''

        credentials = CognitiveServicesCredentials(key)
        text_analytics_client = TextAnalyticsClient(endpoint=endpoint, credentials=credentials)
        return text_analytics_client

    # creates the 'documentsList' which will be sent to azure, for a set of answers.
    def createDocumentsList(self, answers):
        """Return one Azure document per answer, in the order of ``answers``.

        Each answer must be a mapping with the keys ``"body"`` and
        ``"answer_id"``.
        """
        return [self.createDocument(answer, count)
                for count, answer in enumerate(answers, start=1)]

    # creates a document for one specific answer
    def createDocument(self, answer, count):
        """Return the Azure document for a single answer.

        ``count`` is not used: the document id is the answer's own
        ``answer_id`` so that scores can be linked back to the answer. The
        parameter is kept so existing callers keep working.
        """
        body = self.removeMarkUps(answer["body"])
        return {
            "language": "en",
            "id": str(answer["answer_id"]),
            # Stay within the per-document size limit.
            "text": body[:self.MAX_DOCUMENT_CHARS],
        }

    # max documentList size is 1000 elements/ids and documents can have no more than 5120 characters
    # analyzes the sentiments of a list of documents (in our case answers), and returns the api response
    def getSentimentAnalysis(self, documents):
        """Send ``documents`` to the sentiment endpoint and return its response.

        The list is sent as a single request; it is not split into batches
        here, so callers have to respect the 1000-documents limit themselves.
        """
        return self.client.sentiment(documents=documents)

    # Scores `answers` and bundles the result with the ids of `item`.
    def _analyzeItem(self, item, answers):
        response = self.getSentimentAnalysis(self.createDocumentsList(answers))
        return {
            "question_id": item["question_id"],
            # The owner (or its user_id) can be absent; record None instead
            # of aborting the whole batch with a KeyError.
            "user_id": item.get("owner", {}).get("user_id"),
            "response_with_scores": self.azureResponseToJson(response),
        }

    # analyzes one post and the corresponding answers. analyze means: "what is the sentiment score of the answers?"
    # input:
    #     the json of a stackexchange query, in the format s.t.:
    #          one item in the items list is one question with all the answers in an answer list
    # return: returns a list of posts with the corresponding sentiment analysis
    def analyzeFullPostWithAnswers(self, batch):
        """Score the answers of every question in ``batch["items"]``.

        Questions without an ``"answers"`` key are skipped. All answers of one
        question are sent to Azure in a single request.
        """
        return [self._analyzeItem(item, item["answers"])
                for item in batch["items"]
                if "answers" in item]

    def analyzeFullAnswersWithAnswers(self, batch):
        """Score every answer in ``batch["items"]``, one request per answer.

        Here each item is itself an answer (it needs ``"question_id"``,
        ``"answer_id"`` and ``"body"``), so it is analyzed as a one-element
        answer list.
        """
        return [self._analyzeItem(item, [item]) for item in batch["items"]]

    # Formats the azure api response to a json object and returns it
    def azureResponseToJson(self, response):
        """Convert ``response.documents`` into a list of JSON-serializable dicts.

        Each dict holds the document id as ``"answer_id"`` and the score
        formatted as a string with two decimals as ``"sentiment_score"``.
        """
        return [
            {"answer_id": document.id,
             "sentiment_score": "{:.2f}".format(document.score)}
            for document in response.documents
        ]

    # helper function to remove code from text-/answer-body
    def remove_code_from_posts(self, raw_html):
        """Return ``raw_html`` without its ``<code>...</code>`` sections."""
        return self._CODE_PATTERN.sub("", raw_html)

    # helper function to remove markup from text-/answer-body
    def removeMarkUps(self, markUpText):
        """Return ``markUpText`` with code sections and all tags removed."""
        return self._TAG_PATTERN.sub("", self.remove_code_from_posts(markUpText))


### Combine Sentiment Analysis and Answer Crawler into a Loop Version

In [None]:
import json
import time
import requests
import pandas as pd

# please fill in the following parameters before running this cell
input_filename = "user_data/user_male_sample.csv"
output_filename = "answers_male_sample.json"
question_num_per_user = 50 # the maximum for question we need is 50
users_num = 3 # for debug (not read by the loop below)

cleaned_users = pd.read_csv(input_filename)

BASEURL = "https://api.stackexchange.com/2.2/users/"

site = 'stackoverflow'
page = 1
pagesize = question_num_per_user
order = 'desc'
sort = 'activity'
filter_param = '!5-dTNUfY0JWMhwHvF.hA6K4v8DSD0wes1D14dD'

params = {
    "site" : site,
    "page" : page,
    "pagesize": pagesize,
    "order": order,
    "sort": sort,
    "filter": filter_param,
    "access_token" : access_token,
    "key" : key # change the key to your own
}

userSentimentAnalysisResults={}

# One analyzer is enough for the whole crawl; building it inside the loop
# re-created the Azure client for every single user.
sa = SentimentAnalyzer()

# Iterate over the column itself rather than over iterrows(): iterrows() turns
# each row into a Series with one common dtype, which can convert integer ids
# to floats and produce URLs such as ".../users/123.0/answers".
for idx, raw_user_id in cleaned_users['user_id'].items():
    user_id = str(raw_user_id)
    ADD_ID_URL = BASEURL + user_id + '/answers'
    r = requests.get(ADD_ID_URL, params=params)
    payload = r.json()  # decode the body once and reuse it below

    # A response without 'items' cannot be analyzed. Stop with the message the
    # API sent instead of failing later with an unexplained KeyError. Results
    # of the users processed so far have already been written to disk.
    if 'items' not in payload:
        raise RuntimeError(
            "Stack Exchange request for user {} failed (HTTP {}): {}".format(
                user_id,
                r.status_code,
                payload.get('error_message', payload),
            )
        )

    # Wait for the number of seconds given in 'backoff', when present,
    # before doing anything else.
    sleep_time = payload.get('backoff', 0)
    if sleep_time:
        print('backoff:', sleep_time)
        time.sleep(sleep_time)

    analyzedWithSentiments = sa.analyzeFullAnswersWithAnswers(payload)
    userSentimentAnalysisResults[user_id] = analyzedWithSentiments

    # Rewrite the output file after every user so that an interrupted run
    # keeps everything collected up to that point.
    with open(output_filename, 'w') as fp:
        json.dump(userSentimentAnalysisResults, fp, indent=4)

    print(idx, raw_user_id, 'quota_remaining', payload.get('quota_remaining'))


In [None]:

# Save the results gathered so far to a separate, temporary JSON file and
# show them as the output of this cell.
with open("answers_male_sample_temp.json", mode='w') as snapshot_file:
    json.dump(userSentimentAnalysisResults, snapshot_file, indent=4)
userSentimentAnalysisResults

In [None]:
import json


# Load the stored results; the file is only needed for this one read.
with open(output_filename, 'r') as file: # change the filename
    female_dict = json.load(file)

# Flatten the per-user lists into one list of analyzed entries
# (users with an empty list simply contribute nothing).
response_list = [
    entry
    for user_entries in female_dict.values()
    for entry in user_entries
]

# print(response_list[0]['response_with_scores'][0]['sentiment_score'])

# saves all sentiment scores in a list
# Entries whose 'response_with_scores' is missing or empty are skipped;
# only the first score of each remaining entry is used.
sentiment_list = [
    float(entry['response_with_scores'][0]['sentiment_score'])
    for entry in response_list
    if entry.get('response_with_scores')
]

# print(sentiment_list)

total = sum(sentiment_list)

if sentiment_list:
    avg = total/len(sentiment_list)
    print("average score: {}".format(round(avg, 3)))
else:
    # Without any score the division above would raise ZeroDivisionError.
    avg = None
    print("average score: n/a (no sentiment scores found)")