### Read data

In [1]:
import pandas as pd

df = pd.read_csv('../data/att_android_reviews.csv')
df.head()

Unnamed: 0,Comment Date,Ratings,Version,Category,Emotion,Topics,Review
0,2023/01/09,5,21.2.39,opinion,neutral,Bloatware,Trunoff Phone 📱 people Spam Google app
1,2023/01/09,5,22.8.62,reflect_rating,happy,NOT_APPLICABLE,Good app
2,2023/01/09,4,22.8.62,reflect_rating,happy,NOT_APPLICABLE,It's awesome
3,2023/01/09,5,21.12.75,reflect_rating,happy,NOT_APPLICABLE,Great
4,2023/01/08,1,20.12.40,opinion,satisfied,Cost,"We REALLY think this AT&T APP.. .., AT&T HAS B..."


In [2]:
df.Category.value_counts()

opinion           57
reflect_rating    51
bug               32
uncategorized      7
Name: Category, dtype: int64

### Clean data

In [21]:
df['Review'] = df['Review'].str.replace('"', '')

df = df[df['Category'] != 'uncategorized']

df = df[~df['Review'].isin(['i6k kill in mm6 2nd I k', 'Cec3 0 12th 0', "7t5y it'sy8 x uh 21o", 
                              "y imau s zSaaaaj I'mamigration uuuuuu qdddsftxt x, xcfrreey"])]

df = df[~pd.isnull(df['Version'])]
df.shape

(118, 8)

Let's convert this data into text blurbs.

In [22]:
def consolidate_user_feedback(r):
    user_text = f'A user provided a {r.Ratings} start review for app version {r.Version}. '
    if r.Category == 'opinion':
        user_text += f"Ther user gave the following opinion about {r.Topics}: '{r.Review}'. "
    elif r.Category == 'bug':
        user_text += f"The user reported a bug about {r.Topics}: '{r.Review}'."
    elif r.Category == 'reflect_rating':
        user_text += f"The user provided the following comment about the app: '{r.Review}'. "
    return user_text + f"Overall, the user is {r.Emotion} with the app."

df['user_text'] = df.apply(consolidate_user_feedback, axis=1)
df.head()

Unnamed: 0,Comment Date,Ratings,Version,Category,Emotion,Topics,Review,user_text
0,2023/01/09,5,21.2.39,opinion,neutral,Bloatware,Trunoff Phone 📱 people Spam Google app,A user provided a 5 start review for app versi...
1,2023/01/09,5,22.8.62,reflect_rating,happy,NOT_APPLICABLE,Good app,A user provided a 5 start review for app versi...
2,2023/01/09,4,22.8.62,reflect_rating,happy,NOT_APPLICABLE,It's awesome,A user provided a 4 start review for app versi...
3,2023/01/09,5,21.12.75,reflect_rating,happy,NOT_APPLICABLE,Great,A user provided a 5 start review for app versi...
4,2023/01/08,1,20.12.40,opinion,satisfied,Cost,"We REALLY think this AT&T APP.. .., AT&T HAS B...",A user provided a 1 start review for app versi...


In [24]:
df.iloc[10]['user_text']

"A user provided a 5 start review for app version 22.8.62. Ther user gave the following opinion about Service: 'Great Customer Service, Just taking care of the little quirks!'. Overall, the user is happy with the app."

In [None]:
# grab all reviews data into one variable
user_text_all = [' \n'.join(df['user_text'].values.tolist())]

In [32]:
len(user_text_all[0])

25555

We will use this text as the context for our questions.

### OpenAI setup

In [28]:
from dotenv import load_dotenv
from pathlib import Path
import os
import openai
COMPLETIONS_MODEL = 'text-davinci-003'

env_path = Path('..')/'.env'
load_dotenv(dotenv_path=env_path)

openai.api_key = os.getenv('OPENAI_API_KEY')

Note: Since this reviews data is not too large, we will use the first 12,000 characters from it for demonstration purposes. If your corpus is large, then you need to follow the approach outlined in `question_answering_using_proprietary_data.ipynb` notebook.

### Question Answering

In [35]:
prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
{user_text_all[0][:12000]}

Q: What are the users most frustrated about?
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'Bugs, not being able to log in, not being able to back up into folders, and hack phone.'

In [36]:
prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
{user_text_all[0][:12000]}

Q: What are the users most happy about?
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

"The users are most happy about the app's service, cost, other features, and ease of use."

In [37]:
prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
{user_text_all[0][:12000]}

Q: What is the general opinion about the app from users?
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'Most users are happy with the app.'

In [38]:
prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
{user_text_all[0][:12000]}

Q: What is the average rating from the users who are frustrated with the app?
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'1 star'

In [39]:
prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
{user_text_all[0][:12000]}

Q: What is the most common topic the users are commenting about?
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'Service'

In [40]:
prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
{user_text_all[0][:12000]}

Q: What is the average rating among the users who are commenting about Serive?
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

"I don't know."

In [41]:
prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
{user_text_all[0][:12000]}

Q: Are the users who are commenting about Serive happy?
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'Yes, the users who are commenting about Service are happy.'

This demonstrates how we can use some proprietary text data with GPT-3 to get answers that it otherwise wouldn't know.