## Import Libraries

In [None]:
########################################
# importing packages
########################################
import pandas            as pd                          # data science essentials
import matplotlib.pyplot as plt                         # fundamental data visualization
import seaborn           as sns                         # enhanced visualizations
from sklearn.preprocessing import StandardScaler        # standard scaler
from sklearn.decomposition import PCA                   # pca
from scipy.cluster.hierarchy import dendrogram, linkage # dendrograms
from sklearn.cluster         import KMeans              # k-means clustering

sns.set()

## Helper Functions

In [None]:
def columnNamer(cols, isBigFive, isHultDna):
    '''
    Normalise survey question texts and prefix them with an instrument tag.

    Every name is lower-cased, spaces are replaced by underscores and
    apostrophes are removed. The cleaned names are then formatted as
    B5__<NAME_OF_COLUMN> for the big five, or DNA_Q<n>__<NAME_OF_COLUMN>
    for Hult DNA, where <n> is the 1-based position of the name in `cols`.

    ----------------------
    Params
    ----------------------

    cols: pandas Index or Series of column names (must expose `.str`).
    isBigFive: Boolean. If True, function will assume that all the columns
        are from the big five. Takes precedence over isHultDna.
    isHultDna: Boolean. If True, function will assume that all the columns
        are from the hult dna.

    ----------------------
    Returns
    ----------------------

    List of renamed columns, or None (after printing a message) when
    neither flag is set.

    '''

    # regex=False: both patterns are literal characters, not regular expressions
    cleaned = (
        cols.str.lower()
            .str.replace(' ', '_', regex=False)   # Fill spaces with underscore
            .str.replace("'", '', regex=False)    # Remove quotes from questions
    )

    # Iterating (instead of cols[i]) walks the values in order, so a Series
    # with a non-default index is handled the same way as an Index.
    if isBigFive:
        return [f'B5__{name}' for name in cleaned]

    if isHultDna:
        return [f'DNA_Q{position}__{name}'
                for position, name in enumerate(cleaned, start=1)]

    print('Call Manwe, Morgoth introduced a bug in this function')
    return None
        
        
def inertia_plot(data, max_clust = 50):
    """
    Plot k-means inertia against the number of clusters.

    PARAMETERS
    ----------
    data      : DataFrame, data from which to build clusters. Dataset should be scaled
    max_clust : int, exclusive upper bound of the cluster counts to check
                (k runs from 1 to max_clust - 1), default 50
    """

    cluster_counts = range(1, max_clust)

    # one k-means model is instantiated and fitted per candidate k;
    # only its inertia is kept
    inertias = [KMeans(n_clusters = k).fit(data).inertia_
                for k in cluster_counts]

    # line plot of inertia per cluster count on a fresh 12x8 figure
    plt.subplots(figsize = (12, 8))
    plt.plot(cluster_counts, inertias, '-o')

    # axis labels, one tick per k, then render
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(cluster_counts)
    plt.show()


########################################
# scree_plot
########################################
def scree_plot(pca_object, export = False):
    """
    Draw a scree plot (explained variance ratio per component).

    PARAMETERS
    ----------
    pca_object : object exposing n_components_ and explained_variance_ratio_
                 (e.g. a fitted sklearn PCA)
    export     : bool, when True the figure is also saved as
                 'top_customers_correlation_scree_plot.png', default False
    """

    # x positions: one per component, starting at 0
    component_ids = range(pca_object.n_components_)

    # marker/line appearance of the curve
    curve_style = {'linewidth'       : 2,
                   'marker'          : 'o',
                   'markersize'      : 10,
                   'markeredgecolor' : 'black',
                   'markerfacecolor' : 'grey'}

    # new 10x8 figure with the scree curve on it
    plt.subplots(figsize=(10, 8))
    plt.plot(component_ids,
             pca_object.explained_variance_ratio_,
             **curve_style)

    # title, axis labels and one tick per component
    plt.title('Scree Plot')
    plt.xlabel('PCA feature')
    plt.ylabel('Explained Variance')
    plt.xticks(component_ids)

    # optionally write the figure to disk before showing it
    save_requested = (export == True)
    if save_requested:
        plt.savefig('top_customers_correlation_scree_plot.png')

    plt.show()

## Domain Knowledge Research


### Nevid & Pastva (2013)

 - Big Five personality traits did not differentiate between Mac and PC owners. Students overall rated Macs higher on various product attributes (attractive style, cool, youthful, and exciting) and PCs higher on reasonable price and good for gaming.
 
 - PC owners placed greater importance on cost as a determinant of brand choice, whereas Mac owners placed greater emphasis on style. 
 
 - Personality traits may have more nuanced effects on brand choices, as shown by relationships between Neuroticism and greater importance placed on cost and lesser importance placed on ease of use. 
     - **Personality Traits are more important in the brand choice!**
     - More neuroticism, more importance in cost and less importance in ease of use
     
- Openness to Experience was associated with greater importance placed on reliability and lesser importance placed on style.
    - **More openness to experience, more importance in reliability and less importance in style**
    
    
https://www.researchgate.net/publication/259540094_I'm_a_Mac_versus_I'm_a_PC_Personality_Differences_between_Mac_and_PC_Users_in_a_College_Sample

<br> 

___________

<br>

### PC World

<br>


- People who purchase Macs fall into what the branding company calls the "Openness 5" personality category -- which means they are more liberal, less modest and more assured of their own superiority than the population at large.
-  People from Openness 5 seek rich, varied and novel experiences, according to the company, and believe that imagination and intellectual curiosity are as important to life as more rational or pragmatic endeavors.
    - Hypothesis: Mac users will show more openness to new adventures.

## Data Exploration

### General Data Set Exploration

- 1 missing value in ethnicity
- 1 participant said MAC instead of Macbook

In [None]:
# Load the survey responses from the Excel workbook
my_df = pd.read_excel('Survey_Data_Final_Exam.xlsx')

# Count how often each answer to the current-laptop question occurs
my_df['What laptop do you currently have?'].value_counts()

### Dividing Columns Big5 vs Hult DNA

#### Changing Column Names

In [None]:
# Renaming Demographic Data

# Mapping from the original survey question text to a short column name.
# Insertion order is kept and reused below to select the demographic columns.
columns_to_change = {
    'What laptop do you currently have?': 'Current laptop',
    'What laptop would you buy in next assuming if all laptops cost the same?': 'Next laptop',
    'What program are you in?': 'Program',
    'What is your age?': 'Age',
    'Gender': 'Gender',
    'What is your nationality? ': 'Nationality',
    'What is your ethnicity?': 'Ethnicity',
}

my_df.rename(columns=columns_to_change, inplace=True)

# Renaming Big 5 Data
# Separated Big 5 Data set: columns at positions 1..50 of the survey.
# .copy() gives big_five its own data, so renaming its columns and editing
# its values later cannot touch my_df or trigger chained-assignment warnings.
big_five = my_df.iloc[:, 1:51].copy()

# Change Column Names
big_five.columns = columnNamer(big_five.columns,
                               isBigFive=True,
                               isHultDna=False)


# Creating demographic Variables
# Same seven columns, in the same order, as the short names defined above
demographics = my_df.loc[:, list(columns_to_change.values())]

### Grouping Age by Age-Cohorts

In [None]:
# Cohort edges; each bin includes its upper edge (pd.cut default right=True):
# (-inf, 20], (20, 25], (25, 30], (30, 35], (35, 40], (40, inf)
# The previous if/elif chain tested `age > 36`, so age 36 matched no branch
# and silently reused the label of the preceding respondent.
age_bins = [float('-inf'), 20, 25, 30, 35, 40, float('inf')]

# Labels are kept exactly as before ('<20' therefore also covers age 20,
# and '20-25' covers 21 to 25).
age_labels = ['<20', '20-25', '26-30', '31-35', '36-40', '>40']

# Missing ages stay missing instead of inheriting the previous row's cohort.
# astype(object) keeps plain strings rather than a categorical dtype.
placeholder_lst = pd.cut(my_df['Age'],
                         bins=age_bins,
                         labels=age_labels).astype(object).tolist()

my_df['age_range'] = placeholder_lst

### Nationalities Data Cleaning

#### Grouping similar Nationalities

In [None]:
placeholder_lst = []

# Pairs of [raw answers, canonical nationality]. Raw answers are written the
# way they look AFTER normalisation below (lower case, no spaces, no dots).
nation_change = [[['china'],'chinese'],
                 [['peru'], 'peruvian'],
                 [['mexico'], 'mexican'],
                 [['usa'], 'american'],
                 [['russia'], 'russian'],
                 [['ecuador'], 'ecuadorian'],
                 [['brazil'], 'brazilian'],
                 [['nigeria'], 'nigerian'],
                 [['korea','republicofkorea','southkorea'],'korean'],
                 [['spain'],'spanish'],
                 [['indonesia'],'indonesian'],
                 [['germany'],'german'],
                 [['colombia'], 'colombian'],
                 [['taiwan', 'taiwan(roc)'], 'taiwanese'],
                 [['japan'], 'japanese'],
                 [['canada'], 'canadian'],
                 [['philippines'], 'filipino'],
                 [['thailand'], 'thai'],
                 [['india'], 'indian'],
                 [['czechrepublic'], 'czech'],
                 [['belgium'], 'belgian'],
                 [['english'], 'british'],
                 [['ghana'], 'ghanaian'],
                 # '' is what an answer of "." becomes once dots are removed;
                 # the literal '.' key alone could never match
                 [['', '.', 'hispanic'], 'prefernottoanswer'],
                 [['italianandspanish', 'german/american', 'french/brazilian', 'british,indian', 'caribbean-american'], 'multi-ethnic'],
                 [['costarica'], 'costarrican'],
                 [['congolese(drcongo)'], 'congolese'],
                 [['venezuela'],'venezuelan'],
                 [['dominicanrepublic'],'dominican']

                ]

# Flatten the pairs into a single raw-answer -> canonical-name lookup
nation_lookup = {old_nat: new_nat
                 for old_names, new_nat in nation_change
                 for old_nat in old_names}

for nationality in my_df['Nationality']:

    # Missing or non-text answers cannot be normalised; treat them as declined
    if not isinstance(nationality, str):
        placeholder_lst.append('prefernottoanswer')
        continue

    # remove " " and "."
    cleaned = nationality.lower().replace(" ", "").replace(".", "")

    # use the canonical name when one is known, otherwise keep the cleaned answer
    placeholder_lst.append(nation_lookup.get(cleaned, cleaned))

# create a new column
my_df['Nationality2'] = placeholder_lst

#### Grouping Nationalities by continent

In [None]:
# Members of each group, written as they appear in 'Nationality2'
continent_members = {
    'Asian': ['indian','chinese','taiwanese','filipino','korean',
              'thai','indonesian','vietnamese','japanese','palestinian',
              'kyrgyz','pakistani','bangladeshi','iran','malaysia'],

    'European': ['german','russian','italian','spanish','norwegian',
                 'turkish','belgian','czech','british','swiss',
                 'ukrainian','portuguese','belarus','dutch','poland',
                 'armenia','dutch'],

    'North American': ['mexican','american','canadian','dominican','costarrican',
                       'panama','guatemalan','elsalvador','honduran'],

    'South American': ['peruvian','colombian','brazilian','ecuadorian',
                       'venezuelan'],

    'Africa': ['nigerian','kenyan','congolese','ghanaian','ugandan',
               'mauritius','southafrican','cameroon'],

    # multi-ethnic answers keep their own group
    'multi-ethnic': ['multi-ethnic'],
}

# Invert into a nationality -> group lookup (no value belongs to two groups)
group_of = {member: group
            for group, members in continent_members.items()
            for member in members}

# Anything not listed above falls back to 'prefernottoanswer'
placeholder_lst = [group_of.get(nationality, 'prefernottoanswer')
                   for nationality in my_df['Nationality2']]

# create a new column
my_df['Nationality_continent'] = placeholder_lst

### Creating factors big5

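According to the 50-item IPIP version of Goldberg's (1992) Big Five markers, each item is keyed either positively or negatively on one of the five factors.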

#### Fixing Reversed Items 

In [None]:
# Item texts per factor, split into positively keyed (*_pos) and negatively
# keyed (*_neg) statements. The texts are matched to big_five columns after
# being normalised by columnNamer.

#### Factor I (Surgency or Extraversion)

fac1_pos = [
    "am the life of the party",
    "Feel comfortable around people",
    "Start conversations", 
    "Talk to a lot of different people at parties",
    "Don't mind being the center of attention"
    ]

fac1_neg = [ 
    "Don't talk a lot", 
    "Keep in the background", 
    "Have little to say",
    "Don't like to draw attention to myself", 
    "Am quiet around strangers"
]

## Factor II (Agreeableness)

fac2_pos = [
    "Am interested in people", 
    "Sympathize with others' feelings",
    "Have a soft heart", 
    "Take time out for others", 
    "Feel others' emotions",
    "Make people feel at ease"
    ]

fac2_neg = [
    "Am not really interested in others", 
    "Insult people",
    "Am not interested in other people's problems",
    "Feel little concern for others"
]


## Factor III (Conscientiousness)

fac3_pos = [
    "Am always prepared", 
    "Pay attention to details",
    "Get chores done right away",
    "Like order", 
    "Follow a schedule",
    "Am exacting in my work"
]

fac3_neg = [
    "Leave my belongings around", 
    "Make a mess of things",
    "Often forget to put things back in their proper place",
    "Shirk my duties"
]

##Factor IV (Emotional Stability)

fac4_pos = [
            "Am relaxed most of the time", 
            "Seldom feel blue"
]

fac4_neg = [
    "Get stressed out easily",
    "Worry about things", 
    "Am easily disturbed",
    "Get upset easily",
    "Change my mood a lot", 
    "Have frequent mood swings",
    "Get irritated easily",
    "Often feel blue"
]

## Factor V (Intellect or Imagination)

fac5_pos = [
    "Have a rich vocabulary",
    "Have a vivid imagination",
    "Have excellent ideas", 
    "Am quick to understand things",
    "Use difficult words", 
    "Spend time reflecting on things",
    "Am full of ideas"
]

fac5_neg = [
    "Have difficulty understanding abstract ideas",
    "Am not interested in abstract ideas",
    "Do not have a good imagination"
]


# All negatively keyed items, translated to their big_five column names
fac_all_neg = fac1_neg + fac2_neg + fac3_neg + fac4_neg + fac5_neg

fac_all_neg_cols = columnNamer(pd.Series(fac_all_neg), isBigFive=True, isHultDna=False)


## Reverse-score the negatively keyed items.
# The previous code called .replace(..., inplace=True) on the result of
# .loc[...], i.e. on a temporary copy, so big_five was never modified; it
# also only mapped 5->1 and 4->2 and never 1->5 or 2->4.
# `6 - answer` swaps 1<->5 and 2<->4 and leaves 3 unchanged in one step.
# Assumes answers are coded 1-5 - TODO confirm against the survey export.
# NOTE: running this cell twice reverses the items back again.
big_five = big_five.copy()   # own data, independent of the frame it was sliced from
big_five.loc[:, fac_all_neg_cols] = 6 - big_five.loc[:, fac_all_neg_cols]

#### Creating Factors based on Original Study

In [None]:


# Every factor is made of its own positively and negatively keyed items.
# Fixed: extraversion previously used fac2_neg (agreeableness items) and
# intellect previously used fac4_pos (emotional-stability items).
fac1_all = fac1_pos + fac1_neg
fac2_all = fac2_pos + fac2_neg
fac3_all = fac3_pos + fac3_neg
fac4_all = fac4_pos + fac4_neg
fac5_all = fac5_pos + fac5_neg

# Factor name -> item texts, in the column order of the resulting frame
factor_items = {
    'extraversion': fac1_all,
    'agreeableness': fac2_all,
    'conscientiousness': fac3_all,
    'emotional_stability': fac4_all,
    'intellect': fac5_all,
}

# One score per respondent and factor: the row-wise sum of that factor's items
big_five_final = pd.DataFrame(
    {
        factor: big_five[columnNamer(pd.Series(items),
                                     isBigFive=True,
                                     isHultDna=False)].sum(axis = 1)
        for factor, items in factor_items.items()
    }
)


## Modelling


### Clustering Big 5

The team has decided to proceed directly with KMeans clustering. Our reasoning is that Goldberg's Big Five instrument was itself developed using PCA (Exploratory Factor Analysis). Thus, we grouped the items according to the factors defined by the original author of the instrument and decided to cluster users by the personality traits originally defined in Goldberg (1992).

<br> 

- https://ipip.ori.org/newBigFive5broadKey.htm#Conscientiousness
- https://ipip.ori.org/newBigFive5broadTable.htm

**Reference:** Goldberg, L. R. (1992). The development of markers for the Big-Five factor structure. *Psychological Assessment, 4*(1), 26–42.



### Scaling - StandardScaler

In [None]:
# Learn the scaling parameters of every factor score column...
st_scaler = StandardScaler().fit(big_five_final)

# ...and apply them to the same data to obtain the scaled array
big_five_scaled = st_scaler.transform(big_five_final)


### Cluster Selection

The team has chosen 2 clusters because this divides the data points in a better manner. INSERT INERTIA INTERPRETATION HERE. 

In [None]:
# Inertia curve for the scaled factor scores, with max_clust set to 10
inertia_plot(data = big_five_scaled, max_clust = 10)

### Creating Clustering

In [None]:

# Two-cluster k-means with a fixed random_state
k_means_big5 = KMeans(random_state = 802,
                      n_clusters = 2)

# Fit on the scaled factor scores
k_means_big5_fit = k_means_big5.fit(big_five_scaled)

# One row per respondent holding the assigned cluster label
k_means_big5_clusters = pd.DataFrame({'Cluster': k_means_big5_fit.labels_})

# Number of respondents per cluster (author's note: both clusters are balanced)
k_means_big5_clusters['Cluster'].value_counts()

In [None]:

# cluster centers of the fitted model
centroids_big5 = k_means_big5_fit.cluster_centers_

# as a DataFrame whose columns carry the factor names of big_five_final
centroids_big5_df = pd.DataFrame(centroids_big5,
                                 columns = big_five_final.columns)

centroids_big5_df



In [None]:

# Column names for the scaled factor scores
trait_names = ['extraversion',
               'agreeableness',
               'conscientiousness',
               'emotional_stability',
               'intellect']

# Scaled scores as a DataFrame
big_five_scaled_df = pd.DataFrame(data = big_five_scaled,
                                  columns = trait_names)

# Cluster label, demographics and scaled scores side by side (aligned on the index)
data_df = pd.concat(objs = [k_means_big5_clusters,
                            demographics,
                            big_five_scaled_df],
                    axis = 'columns')

### Analysis of Demographics by Big Five

In [None]:
########################
# Current laptop
########################

# Agreeableness by current laptop, split by cluster (author's note: balanced)
fig, ax = plt.subplots(figsize = (12, 8))

sns.boxplot(data = data_df,
            x    = 'Current laptop',
            y    = 'agreeableness',
            hue  = 'Cluster',
            ax   = ax)

fig.tight_layout()
plt.show()
