In [98]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [99]:
# Load persona.csv from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/persona-csv/persona.csv")

In [100]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [101]:
# Summarize the index, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   PRICE    5000 non-null   int64 
 1   SOURCE   5000 non-null   object
 2   SEX      5000 non-null   object
 3   COUNTRY  5000 non-null   object
 4   AGE      5000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 234.4+ KB


In [102]:
# Number of missing values in every column.
df.isna().sum()

PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64

In [103]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PRICE,AGE
count,5000.0,5000.0
mean,34.132,23.5814
std,12.464897,8.995908
min,9.0,15.0
25%,29.0,17.0
50%,39.0,21.0
75%,39.0,27.0
max,59.0,66.0


In [104]:
# How many distinct SOURCE values exist, and how often each one occurs
source_counts = df["SOURCE"].value_counts()
unique_sources = df["SOURCE"].nunique()
print("Number of unique sources:", unique_sources)
# sep="\n" puts the header and the Series on separate lines, as two prints would.
print("Source frequencies:", source_counts, sep="\n")

Number of unique sources: 2
Source frequencies:
SOURCE
android    2974
ios        2026
Name: count, dtype: int64


In [None]:
# Number of distinct PRICE values (missing values are not counted)
unique_prices = df["PRICE"].nunique(dropna=True)
print("Number of unique prices:", unique_prices)

In [55]:
# Tally how many rows (sales) were recorded at each PRICE
price_counts = df["PRICE"].value_counts()
print("Price frequencies:", price_counts, sep="\n")

In [56]:
# Count the number of sales from each COUNTRY.
# Each row of df is counted as one sale, so the per-country total is simply
# the size of every COUNTRY group. (groupby("COUNTRY").value_counts() would
# instead count each distinct combination of the remaining columns inside a
# country, which is not a per-country sales total.)
country_sales = df.groupby("COUNTRY").size()
print("Sales from each country:")
print(country_sales)

In [57]:
# Total revenue per COUNTRY: the sum of PRICE over that country's rows
country_revenue = df.groupby("COUNTRY")["PRICE"].agg("sum")
print("Total revenue by country:", country_revenue, sep="\n")

In [58]:
# Number of rows (sales) recorded for each SOURCE type
source_sales_counts = df["SOURCE"].value_counts()
print("Sales by source type:", source_sales_counts, sep="\n")

In [59]:
# Mean PRICE within each COUNTRY
country_price_averages = df.groupby("COUNTRY")["PRICE"].agg("mean")
print("Average price by country:", country_price_averages, sep="\n")

In [60]:
# Mean PRICE within each SOURCE type
source_price_averages = df.groupby("SOURCE")["PRICE"].agg("mean")
print("Average price by source type:", source_price_averages, sep="\n")

In [61]:
# Mean PRICE for every (COUNTRY, SOURCE) pair
country_source_price_avg = (
    df.groupby(by=["COUNTRY", "SOURCE"])["PRICE"]
    .agg("mean")
)
print("Average price by country and source type:", country_source_price_avg, sep="\n")

In [62]:
# Mean PRICE for every (COUNTRY, SOURCE, SEX, AGE) combination.
# The result is a Series indexed by those four keys.
avg_earnings = (
    df.groupby(by=["COUNTRY", "SOURCE", "SEX", "AGE"])["PRICE"]
    .agg("mean")
)
print("Average earnings by country, source, sex, and age:", avg_earnings, sep="\n")

In [89]:
# Order the averaged PRICE values from highest to lowest.
agg_df = avg_earnings.sort_values(ascending=False)
print("Sorted average earnings by country, source, sex, and age:", agg_df, sep="\n")

In [90]:
# Move the index levels into ordinary columns (drop=False keeps them as data).
agg_df = agg_df.reset_index(drop=False)
print("Data with index names converted to variable names:", agg_df, sep="\n")

In [91]:
# Bucket AGE into labelled ranges and store the result as the AGE_GROUP column.
# pd.cut intervals are right-inclusive by default: (0, 18], (18, 23], (23, 30], ...
agg_df["AGE_GROUP"] = pd.cut(
    x=agg_df["AGE"],
    bins=[0, 18, 23, 30, 40, 70],
    labels=["0_18", "19_23", "24_30", "31_40", "41_70"],
)
print("Data with AGE grouped into categories:", agg_df, sep="\n")

In [92]:
# Build the persona label COUNTRY_SOURCE_SEX_AGEGROUP for every row: the three
# text columns are upper-cased and joined with "_" to the AGE_GROUP label.
agg_df["customers_level_based"] = agg_df["COUNTRY"].str.upper().str.cat(
    [
        agg_df["SOURCE"].str.upper(),
        agg_df["SEX"].str.upper(),
        agg_df["AGE_GROUP"].astype(str),
    ],
    sep="_",
)
print("Data with level-based customers added:", agg_df, sep="\n")

In [93]:
# Collapse rows that share a persona label into a single row holding the mean
# of their PRICE values (each input row counts equally). as_index=False keeps
# customers_level_based as a regular column.
agg_df = agg_df.groupby("customers_level_based", as_index=False)["PRICE"].mean()
print("Grouped data with mean prices for level-based customers:", agg_df, sep="\n")

In [95]:
# Assign each persona (e.g. USA_ANDROID_MALE_0_18) to a PRICE quartile:
# "D" is the lowest-priced quarter, "A" the highest.
agg_df["SEGMENT"] = pd.qcut(
    x=agg_df["PRICE"],
    q=4,
    labels=["D", "C", "B", "A"],
)
print("Data with customer segments:", agg_df, sep="\n")

In [96]:
# Per-segment overview of PRICE: number of personas plus mean / min / max.
# observed=True limits the result to segment categories that actually occur.
segment_summary = agg_df.groupby("SEGMENT", observed=True)["PRICE"].agg(
    count="count",
    mean_price="mean",
    min_price="min",
    max_price="max",
)
print("Segment summary:", segment_summary, sep="\n")

In [97]:
# Predict classification for new customers


def predict_persona(personas, persona):
    """Look up the segment and expected income for a persona label.

    Parameters
    ----------
    personas : pandas.DataFrame
        Table with the columns "customers_level_based", "SEGMENT" and "PRICE".
    persona : str
        Persona label to search for, e.g. "TUR_ANDROID_FEMALE_31_40".

    Returns
    -------
    tuple
        (matching rows, segment, expected income). When no row matches, the
        segment is "Segment not found" and the income is
        "No income prediction".
    """
    rows = personas[personas["customers_level_based"] == persona]
    if rows.empty:
        return rows, "Segment not found", "No income prediction"
    # Only the first matching row is reported.
    return rows, rows["SEGMENT"].iloc[0], rows["PRICE"].iloc[0]


# 33-year-old Turkish woman using ANDROID
new_user1 = "TUR_ANDROID_FEMALE_31_40"
result1, segment1, predicted_income1 = predict_persona(agg_df, new_user1)
print(f"New customer 1: Segment -> {segment1}, Expected income -> {predicted_income1}")

# 35-year-old French woman using IOS
new_user2 = "FRA_IOS_FEMALE_31_40"
result2, segment2, predicted_income2 = predict_persona(agg_df, new_user2)
print(f"New customer 2: Segment -> {segment2}, Expected income -> {predicted_income2}")

New customer 1: Segment -> A, Expected income -> 41.83333333333333
New customer 2: Segment -> C, Expected income -> 32.81818181818181
