In [2]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# importing the csv file as a DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
df = pd.read_csv('persona.csv')
df.head()

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [4]:
# Checking DataFrame
print("##################### Shape #####################")
print(df.shape)
print("##################### Types #####################")
print(df.dtypes)
print("##################### Head #####################")
print(df.head(5))
print("##################### Tail #####################")
print(df.tail(5))
print("##################### NA #####################")
print(df.isnull().sum())
print("##################### Quantiles #####################")
print(df.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

##################### Shape #####################
(5000, 5)
##################### Types #####################
PRICE       int64
SOURCE     object
SEX        object
COUNTRY    object
AGE         int64
dtype: object
##################### Head #####################
   PRICE   SOURCE   SEX COUNTRY  AGE
0     39  android  male     bra   17
1     39  android  male     bra   17
2     49  android  male     bra   17
3     29  android  male     tur   17
4     49  android  male     tur   17
##################### Tail #####################
      PRICE   SOURCE     SEX COUNTRY  AGE
4995     29  android  female     bra   31
4996     29  android  female     bra   31
4997     29  android  female     bra   31
4998     39  android  female     bra   31
4999     29  android  female     bra   31
##################### NA #####################
PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64
##################### Quantiles #####################
        count     mean        std  

In [24]:
# Let's look at the data under demographic categories:
agg_df = df.groupby(['SOURCE', 'COUNTRY', 'SEX', 'AGE']).agg({'PRICE':'mean'}).sort_values('PRICE', ascending=False)
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRICE
SOURCE,COUNTRY,SEX,AGE,Unnamed: 4_level_1
android,fra,female,24,59.0
android,usa,male,36,59.0
android,bra,male,46,59.0
ios,usa,male,32,54.0
android,deu,female,36,49.0
android,...,...,...,...
android,fra,male,18,19.0
ios,tur,male,47,19.0
android,can,female,27,19.0
ios,usa,female,38,19.0


In [25]:
# As we can see, we are able to build a value patern acording to customers demogrophic informations. For example we can build a segmantation for the customers whom are from USA, female and aged between 24 - 30. With this segmentation we can have a general idea of customers spends.

In [26]:
# Resseting the indexes of data:
agg_df = agg_df.reset_index()

In [27]:
agg_df

Unnamed: 0,SOURCE,COUNTRY,SEX,AGE,PRICE
0,android,fra,female,24,59.0
1,android,usa,male,36,59.0
2,android,bra,male,46,59.0
3,ios,usa,male,32,54.0
4,android,deu,female,36,49.0
...,...,...,...,...,...
343,android,fra,male,18,19.0
344,ios,tur,male,47,19.0
345,android,can,female,27,19.0
346,ios,usa,female,38,19.0


In [28]:
# Creating a category for age:
agg_df['AGE_CAT'] = agg_df['AGE'].apply(lambda x: '0_18' if x <= 18
                                   else('19_23' if x == 19 | x <= 23
                                   else('24_30' if x == 24 | x <= 30
                                   else('31_40' if x == 31 | x <= 40
                                   else'41_70'))))


In [29]:
agg_df

Unnamed: 0,SOURCE,COUNTRY,SEX,AGE,PRICE,AGE_CAT
0,android,fra,female,24,59.0,24_30
1,android,usa,male,36,59.0,41_70
2,android,bra,male,46,59.0,41_70
3,ios,usa,male,32,54.0,41_70
4,android,deu,female,36,49.0,41_70
...,...,...,...,...,...,...
343,android,fra,male,18,19.0,0_18
344,ios,tur,male,47,19.0,41_70
345,android,can,female,27,19.0,24_30
346,ios,usa,female,38,19.0,41_70


In [30]:
# Merging the categories for segmentation:
agg_df['customers_level_based'] = agg_df[['COUNTRY', 'SOURCE', 'SEX', 'AGE_CAT']].agg('_'.join, axis = 1)

# Dropping the categories after merge
agg_df.drop(['COUNTRY', 'SOURCE', 'SEX', 'AGE_CAT', 'AGE'], axis = 1, inplace = True)

In [31]:
agg_df

Unnamed: 0,PRICE,customers_level_based
0,59.0,fra_android_female_24_30
1,59.0,usa_android_male_41_70
2,59.0,bra_android_male_41_70
3,54.0,usa_ios_male_41_70
4,49.0,deu_android_female_41_70
...,...,...
343,19.0,fra_android_male_0_18
344,19.0,tur_ios_male_41_70
345,19.0,can_android_female_24_30
346,19.0,usa_ios_female_41_70


In [32]:
# Now we have the appropriate dataset for segmentation. 
# Lets separate our data into the segments of ['D', 'C', 'B', 'A']:
agg_df['SEGMENT'] = pd.qcut(agg_df['PRICE'],  4, labels=['D', 'C', 'B', 'A'])

agg_df

Unnamed: 0,PRICE,customers_level_based,SEGMENT
0,59.0,fra_android_female_24_30,A
1,59.0,usa_android_male_41_70,A
2,59.0,bra_android_male_41_70,A
3,54.0,usa_ios_male_41_70,A
4,49.0,deu_android_female_41_70,A
...,...,...,...
343,19.0,fra_android_male_0_18,D
344,19.0,tur_ios_male_41_70,D
345,19.0,can_android_female_24_30,D
346,19.0,usa_ios_female_41_70,D


In [33]:
# And let's check the segments according to their price values:
agg_df.groupby('SEGMENT').agg(['sum', 'mean', 'max'])

  agg_df.groupby('SEGMENT').agg(['sum', 'mean', 'max'])


Unnamed: 0_level_0,PRICE,PRICE,PRICE
Unnamed: 0_level_1,sum,mean,max
SEGMENT,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
D,2375.32585,27.302596,31.105263
C,3128.667165,32.933339,34.0
B,2870.329792,35.43617,37.0
A,3521.952577,41.434736,59.0


In [35]:
# As we can see, we have a rule-based segmantation acording to our data set and this segmentation provide us to have guesses about our customers behaviours.

# For example lets have guess about a turkish woman whom aged as 33 and using Android and a french woman whom aged 35 and using IOS:

# Enterig data for our new customers
user_country = ['tur', 'fra']
user_source = ['android', 'ios']
user_sex = ['female', 'female']
user_age_flag = ['31_40', '31_40']

# Making a tuple with the new customers data
new_users_tuples = list(zip(user_country, user_source, user_sex, user_age_flag))
new_users = ['_'.join(tuple) for tuple in new_users_tuples]

#Guessing:
for new_user in new_users:
    print(agg_df[agg_df.customers_level_based == new_user])

        PRICE     customers_level_based SEGMENT
35  40.666667  tur_android_female_31_40       A
         PRICE customers_level_based SEGMENT
222  32.636364  fra_ios_female_31_40       C


In [None]:
# As we can see turk customer is included in segment "A" which is expected to make a spend close to $40 while french customer is included in segment C which is expected to make a spend close to $32.