### Chapter 7 Activities



In [20]:
# Import necessary Libraries
import pandas as pd
import chardet #To identify Encoding Type

In [21]:
# Identifying the encoding type & Reading Candy Hierarchry CSV

with open('candyhierarchy2017.csv', 'rb') as f:
    result = chardet.detect(f.read())

df = pd.read_csv('candyhierarchy2017.csv', encoding=result['encoding'])
df.head(5)

Unnamed: 0,Internal ID,Q1: GOING OUT?,Q2: GENDER,Q3: AGE,Q4: COUNTRY,"Q5: STATE, PROVINCE, COUNTY, ETC",Q6 | 100 Grand Bar,Q6 | Anonymous brown globs that come in black and orange wrappers\t(a.k.a. Mary Janes),Q6 | Any full-sized candy bar,Q6 | Black Jacks,...,Q8: DESPAIR OTHER,Q9: OTHER COMMENTS,Q10: DRESS,Unnamed: 113,Q11: DAY,Q12: MEDIA [Daily Dish],Q12: MEDIA [Science],Q12: MEDIA [ESPN],Q12: MEDIA [Yahoo],"Click Coordinates (x, y)"
0,90258773,,,,,,,,,,...,,,,,,,,,,
1,90272821,No,Male,44.0,USA,NM,MEH,DESPAIR,JOY,MEH,...,,Bottom line is Twix is really the only candy w...,White and gold,,Sunday,,1.0,,,"(84, 25)"
2,90272829,,Male,49.0,USA,Virginia,,,,,...,,,,,,,,,,
3,90272840,No,Male,40.0,us,or,MEH,DESPAIR,JOY,MEH,...,,Raisins can go to hell,White and gold,,Sunday,,1.0,,,"(75, 23)"
4,90272841,No,Male,23.0,usa,exton pa,JOY,DESPAIR,JOY,DESPAIR,...,,,White and gold,,Friday,,1.0,,,"(70, 10)"


In [22]:
# Filter out missing data
df = df.dropna(subset=['Q2: GENDER', 'Q3: AGE', 'Q4: COUNTRY']) #Drop rows where key identifying columns are missing
df.head(5)

Unnamed: 0,Internal ID,Q1: GOING OUT?,Q2: GENDER,Q3: AGE,Q4: COUNTRY,"Q5: STATE, PROVINCE, COUNTY, ETC",Q6 | 100 Grand Bar,Q6 | Anonymous brown globs that come in black and orange wrappers\t(a.k.a. Mary Janes),Q6 | Any full-sized candy bar,Q6 | Black Jacks,...,Q8: DESPAIR OTHER,Q9: OTHER COMMENTS,Q10: DRESS,Unnamed: 113,Q11: DAY,Q12: MEDIA [Daily Dish],Q12: MEDIA [Science],Q12: MEDIA [ESPN],Q12: MEDIA [Yahoo],"Click Coordinates (x, y)"
1,90272821,No,Male,44,USA,NM,MEH,DESPAIR,JOY,MEH,...,,Bottom line is Twix is really the only candy w...,White and gold,,Sunday,,1.0,,,"(84, 25)"
2,90272829,,Male,49,USA,Virginia,,,,,...,,,,,,,,,,
3,90272840,No,Male,40,us,or,MEH,DESPAIR,JOY,MEH,...,,Raisins can go to hell,White and gold,,Sunday,,1.0,,,"(75, 23)"
4,90272841,No,Male,23,usa,exton pa,JOY,DESPAIR,JOY,DESPAIR,...,,,White and gold,,Friday,,1.0,,,"(70, 10)"
6,90272853,No,Male,53,usa,Colorado,,,,,...,,,,,,,,,,


In [23]:
#Fill in missing data

#Fill missing data in text columns with a placeholder
text_cols = df.select_dtypes(include='object').columns
df[text_cols] = df[text_cols].fillna("Unknown")

#Fill missing numeric values with the mean
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

#Verify no missing values left
print(df.isnull().sum())

Internal ID                 0
Q1: GOING OUT?              0
Q2: GENDER                  0
Q3: AGE                     0
Q4: COUNTRY                 0
                           ..
Q12: MEDIA [Daily Dish]     0
Q12: MEDIA [Science]        0
Q12: MEDIA [ESPN]           0
Q12: MEDIA [Yahoo]          0
Click Coordinates (x, y)    0
Length: 120, dtype: int64


In [24]:
# Remove duplicates
df = df.drop_duplicates()

print("Total duplicate rows:", df.duplicated().sum()) # Check for Duplicates

Total duplicate rows: 0


In [25]:
# Transform data using mapping for Gender
gender_map = {
    'male': 'Male',
    'm': 'Male',
    'M': 'Male',
    'female': 'Female',
    'f': 'Female',
    'F': 'Female'
}

df['Q2: GENDER'] = df['Q2: GENDER'].str.strip().str.lower().map(gender_map)

df.head(5)

Unnamed: 0,Internal ID,Q1: GOING OUT?,Q2: GENDER,Q3: AGE,Q4: COUNTRY,"Q5: STATE, PROVINCE, COUNTY, ETC",Q6 | 100 Grand Bar,Q6 | Anonymous brown globs that come in black and orange wrappers\t(a.k.a. Mary Janes),Q6 | Any full-sized candy bar,Q6 | Black Jacks,...,Q8: DESPAIR OTHER,Q9: OTHER COMMENTS,Q10: DRESS,Unnamed: 113,Q11: DAY,Q12: MEDIA [Daily Dish],Q12: MEDIA [Science],Q12: MEDIA [ESPN],Q12: MEDIA [Yahoo],"Click Coordinates (x, y)"
1,90272821,No,Male,44,USA,NM,MEH,DESPAIR,JOY,MEH,...,Unknown,Bottom line is Twix is really the only candy w...,White and gold,Unknown,Sunday,1.0,1.0,1.0,1.0,"(84, 25)"
2,90272829,Unknown,Male,49,USA,Virginia,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,1.0,1.0,1.0,1.0,Unknown
3,90272840,No,Male,40,us,or,MEH,DESPAIR,JOY,MEH,...,Unknown,Raisins can go to hell,White and gold,Unknown,Sunday,1.0,1.0,1.0,1.0,"(75, 23)"
4,90272841,No,Male,23,usa,exton pa,JOY,DESPAIR,JOY,DESPAIR,...,Unknown,Unknown,White and gold,Unknown,Friday,1.0,1.0,1.0,1.0,"(70, 10)"
6,90272853,No,Male,53,usa,Colorado,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,1.0,1.0,1.0,1.0,Unknown


In [26]:
# Age Group Transformation using Custom Function

def age_group(age):
    try:
        age = int(age)
        if age < 18:
            return 'Child'
        elif age < 35:
            return 'Young Adult'
        elif age < 60:
            return 'Adult'
        else:
            return 'Senior'
    except:
        return 'Unknown'

df['Age Group'] = df['Q3: AGE'].apply(age_group)

df.head(5)

Unnamed: 0,Internal ID,Q1: GOING OUT?,Q2: GENDER,Q3: AGE,Q4: COUNTRY,"Q5: STATE, PROVINCE, COUNTY, ETC",Q6 | 100 Grand Bar,Q6 | Anonymous brown globs that come in black and orange wrappers\t(a.k.a. Mary Janes),Q6 | Any full-sized candy bar,Q6 | Black Jacks,...,Q9: OTHER COMMENTS,Q10: DRESS,Unnamed: 113,Q11: DAY,Q12: MEDIA [Daily Dish],Q12: MEDIA [Science],Q12: MEDIA [ESPN],Q12: MEDIA [Yahoo],"Click Coordinates (x, y)",Age Group
1,90272821,No,Male,44,USA,NM,MEH,DESPAIR,JOY,MEH,...,Bottom line is Twix is really the only candy w...,White and gold,Unknown,Sunday,1.0,1.0,1.0,1.0,"(84, 25)",Adult
2,90272829,Unknown,Male,49,USA,Virginia,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,1.0,1.0,1.0,1.0,Unknown,Adult
3,90272840,No,Male,40,us,or,MEH,DESPAIR,JOY,MEH,...,Raisins can go to hell,White and gold,Unknown,Sunday,1.0,1.0,1.0,1.0,"(75, 23)",Adult
4,90272841,No,Male,23,usa,exton pa,JOY,DESPAIR,JOY,DESPAIR,...,Unknown,White and gold,Unknown,Friday,1.0,1.0,1.0,1.0,"(70, 10)",Young Adult
6,90272853,No,Male,53,usa,Colorado,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,1.0,1.0,1.0,1.0,Unknown,Adult


In [28]:
# Replace values using Mapping

# Standardize text: lowercase and strip whitespace
df['Q4: COUNTRY'] = df['Q4: COUNTRY'].astype(str).str.strip().str.lower()

# Define the mapping dictionary
country_map = {
    # USA variations
    'usa': 'USA', 'us': 'USA', 'united states': 'USA', 'united states of america': 'USA',
    'america': 'USA', 'murica': 'USA', 'u.s.a.': 'USA', 'u.s.': 'USA', 'u s': 'USA',
    'us of a': 'USA', 'united state': 'USA', 'unied states': 'USA', 'unites states': 'USA',
    'united staes': 'USA', 'united statss': 'USA', 'united ststes': 'USA',
    'usa? hard to tell anymore..': 'USA', 'usa! usa! usa!': 'USA', 'usaa': 'USA',
    "'merica": 'USA', 'usausausa': 'USA', 'i pretend to be from canada, but i am really from the united states.': 'USA',
    'north carolina': 'USA', 'new jersey': 'USA', 'california': 'USA', 'pittsburgh': 'USA',
    
    # Canada variations
    'canada': 'Canada', 'canada ': 'Canada', 'can': 'Canada', 'canae': 'Canada',
    'canada`': 'Canada', 'soviet canuckistan': 'Canada',

    # UK variations
    'uk': 'UK', 'u.k.': 'UK', 'united kingdom': 'UK', 'england': 'UK',
    'uk ': 'UK', 'scotland': 'UK', 'scotland ': 'UK', 'endland': 'UK',

    # Other countries
    'france': 'France', 'france ': 'France', 'germany': 'Germany', 'japan': 'Japan',
    'china': 'China', 'australia': 'Australia', 'greece': 'Greece', 'iceland': 'Iceland',
    'indonesia': 'Indonesia', 'ireland': 'Ireland', 'ireland ': 'Ireland',
    'south africa': 'South Africa', 'south korea': 'South Korea', 'sweden': 'Sweden',
    'switzerland': 'Switzerland', 'taiwan': 'Taiwan', 'spain': 'Spain',
    'singapore': 'Singapore', 'hong kong': 'Hong Kong', 'the netherlands': 'Netherlands',
    'netherlands': 'Netherlands',

    # Others / unclear
    'earth': 'Other', 'atlantis': 'Other', 'narnia': 'Other', 'fear and loathing': 'Other',
    'i don\'t know anymore': 'Other', 'insanity lately': 'Other', 'trumpistan': 'Other',
    'ahem....amerca': 'Other', '1': 'Other', 'ud': 'Other', 'a': 'Other',
    'new york': 'USA', 'europe': 'Other', 'unhinged states': 'USA',
    'cascadia': 'Other', 'n. america': 'Other', 'subscribe to dm4uz3 on youtube': 'Other'
}

# Apply mapping
df['Q4: COUNTRY'] = df['Q4: COUNTRY'].map(country_map).fillna('Other')

df.head(5)

Unnamed: 0,Internal ID,Q1: GOING OUT?,Q2: GENDER,Q3: AGE,Q4: COUNTRY,"Q5: STATE, PROVINCE, COUNTY, ETC",Q6 | 100 Grand Bar,Q6 | Anonymous brown globs that come in black and orange wrappers\t(a.k.a. Mary Janes),Q6 | Any full-sized candy bar,Q6 | Black Jacks,...,Q9: OTHER COMMENTS,Q10: DRESS,Unnamed: 113,Q11: DAY,Q12: MEDIA [Daily Dish],Q12: MEDIA [Science],Q12: MEDIA [ESPN],Q12: MEDIA [Yahoo],"Click Coordinates (x, y)",Age Group
1,90272821,No,Male,44,USA,NM,MEH,DESPAIR,JOY,MEH,...,Bottom line is Twix is really the only candy w...,White and gold,Unknown,Sunday,1.0,1.0,1.0,1.0,"(84, 25)",Adult
2,90272829,Unknown,Male,49,USA,Virginia,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,1.0,1.0,1.0,1.0,Unknown,Adult
3,90272840,No,Male,40,USA,or,MEH,DESPAIR,JOY,MEH,...,Raisins can go to hell,White and gold,Unknown,Sunday,1.0,1.0,1.0,1.0,"(75, 23)",Adult
4,90272841,No,Male,23,USA,exton pa,JOY,DESPAIR,JOY,DESPAIR,...,Unknown,White and gold,Unknown,Friday,1.0,1.0,1.0,1.0,"(70, 10)",Young Adult
6,90272853,No,Male,53,USA,Colorado,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,1.0,1.0,1.0,1.0,Unknown,Adult
