## Lecture: Thinking Data
### In this lecture we are going to learn to think in terms of data & try to answer some important questions as an AI Engineer/ Data Scientist:
- What is clean & structured data?
- How to get insights from data?
- How to use data to improve products & build features (eg - productrecommendations)?

In [4]:
import json

In [81]:
#function to load the data
def load_data(filename):
    with open(filename, "r") as f:
        data = json.load(f)
    return data 

In [82]:
data = load_data("amazon_store_data.json")

In [98]:
#function to clean the data
def clean_data(data):
    text_to_num = {"one" : 1, "two" : 2, "three" : 3, "four": 4, "five": 5}
    
    unique_users = set()
    cleaned_data = []
    
    for user in data:
        #clean rating - consistent typing
        raw_rating = str(user['rating']).strip().lower()
        if(raw_rating in text_to_num):
            raw_rating = text_to_num[raw_rating]
        user['rating'] = raw_rating

        #handle missing values
        raw_age = user.get("age")
        if(raw_age == None):
            user["age"] = None

        #handle duplicate values - deduplication
        if(user['name'].strip() in unique_users):
            continue
        unique_users.add(user['name'])
        cleaned_data.append(user)
        
    return cleaned_data
        

In [99]:
data = clean_data(data)
print(data, len(data))

[{'name': 'Alice', 'rating': '5', 'feedback': 'Great product!!', 'age': '25'}, {'name': 'Bob', 'rating': '4', 'feedback': 'ok but late Delivery', 'age': '30'}, {'name': ' Charlie', 'rating': '2', 'feedback': 'BAD EXPERIENCE ', 'age': None}, {'name': 'Diana', 'feedback': 'Loved it!', 'rating': '5', 'age': '28'}, {'name': 'Eve', 'rating': '3.5', 'feedback': 'Average - could be better', 'age': '20'}] 5


In [106]:
#function to get insights from data
def get_insights(data):
    #avg rating
    tot_ratings = 0
    for user in data:
        tot_ratings += float(user['rating'])
    
    print(f"average product rating = {tot_ratings/len(data)}")

    #percentage users with poor rating
    poor_ratings = 0
    for user in data:
        if(float(user['rating']) < 3.0):
            poor_ratings += 1
    
    print(f"% users with poor rating = {poor_ratings/len(data)*100}%")

In [107]:
get_insights(data)

average product rating = 3.9
% users with poor rating = 20.0%


In [114]:
#product recommendation feature
def get_recommendations(data):
    recommendations = []
    
    for user in data:
        curr_recomm = {}
        curr_recomm['name'] = user['name']
        if(float(user['rating']) >= 4):
            curr_recomm['brand'] = "Apple"
        else:
            curr_recomm['brand'] = "Samsung"

        recommendations.append(curr_recomm)
    
    return recommendations
        

In [115]:
recommendations = get_recommendations(data)
print(recommendations)

[{'name': 'Alice', 'brand': 'Apple'}, {'name': 'Bob', 'brand': 'Apple'}, {'name': ' Charlie', 'brand': 'Samsung'}, {'name': 'Diana', 'brand': 'Apple'}, {'name': 'Eve', 'brand': 'Samsung'}]
