# Adjectives describing bodyparts

In [98]:
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
sns.set_theme(style="whitegrid", context="talk")
import matplotlib.pyplot as plt
import ast

In [99]:
#load data
df = pd.read_csv("/Users/thearolskovsloth/data-science-x-am/output/body_descriptions.csv")
#df.head(20)
df.head()

Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender
0,0,hands,her,F,[],PG67640_text,F
1,1,lung,her,F,['left'],PG67640_text,F
2,2,face,her,F,[],PG67640_text,F
3,3,lungs,his,M,[],PG67640_text,F
4,4,head,his,M,[],PG67640_text,F


In [100]:
#take only rows that have a description of the body part
df = df[df['description'].str.len() > 2]
len(df)

69593

In [101]:
#lowercase all body parts
df["bodypart"] = df["bodypart"].str.lower()
len(df)

69593

In [102]:
#lemmatize all the bodyparts
nlp = spacy.load("en_core_web_sm")
#nlp.disable_pipes("ner", "parser")
#nlp.max_length = 2568609

lemmas = df.bodypart.str.cat(sep=' ')
lemmas

lemmatized_bodyparts = nlp(lemmas)
len(lemmatized_bodyparts)

69593

In [103]:
#apply to all bodypart obs
lemmatized_body = [w.lemma_ for w in lemmatized_bodyparts]
len(lemmatized_body)

69593

In [104]:
#overwrite the bodypart column with the lemmatied bodyparts
df['bodypart'] = lemmatized_body
df.head(5)

Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender
1,1,lung,her,F,['left'],PG67640_text,F
5,5,lungs,his,M,['sick'],PG67640_text,F
10,10,nose,his,M,['pink'],PG61671_text,M
13,13,nose,his,M,['pink'],PG61671_text,M
14,14,nose,his,M,"['soft', 'tender']",PG61671_text,M


In [105]:
#manual lemmatizing :)

df['bodypart'] = np.where(df['bodypart'] == "lips", "lip", df['bodypart'])
df.loc[df['bodypart'] == "lips"]

Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender


In [106]:
df['description'] = df['description'].str.replace('[','')
df['description'] = df['description'].str.replace(']','')
df['description'] = df['description'].str.replace("'",'')
df


  df['description'] = df['description'].str.replace('[','')
  df['description'] = df['description'].str.replace(']','')


Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender
1,1,lung,her,F,left,PG67640_text,F
5,5,lungs,his,M,sick,PG67640_text,F
10,10,nose,his,M,pink,PG61671_text,M
13,13,nose,his,M,pink,PG61671_text,M
14,14,nose,his,M,"soft, tender",PG61671_text,M
...,...,...,...,...,...,...,...
466412,466412,lung,his,M,splendid,PG52248_text,M
466421,466421,eye,Fermin,M,sharp,PG52248_text,M
466424,466424,tongue,his,M,native,PG52248_text,M
466440,466440,foot,his,M,front,PG52248_text,M


In [107]:
#unfold the descriptions that has more than one word
df= df.assign(description=df.description.str.split(",")).explode('description')


In [108]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender
1,1,lung,her,F,left,PG67640_text,F
5,5,lungs,his,M,sick,PG67640_text,F
10,10,nose,his,M,pink,PG61671_text,M
13,13,nose,his,M,pink,PG61671_text,M
14,14,nose,his,M,soft,PG61671_text,M
14,14,nose,his,M,tender,PG61671_text,M
19,19,eye,his,M,grey,PG66460_text,M
21,21,eye,his,M,grey,PG66460_text,M
29,29,shoulder,his,M,giant,PG66460_text,M
33,33,beard,his,M,great,PG66460_text,M


In [109]:
#make sure to remove everything else than the letters
import re
df['description'] = df['description'].map(lambda x: re.sub(r'\W+', '', x))
df

Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender
1,1,lung,her,F,left,PG67640_text,F
5,5,lungs,his,M,sick,PG67640_text,F
10,10,nose,his,M,pink,PG61671_text,M
13,13,nose,his,M,pink,PG61671_text,M
14,14,nose,his,M,soft,PG61671_text,M
...,...,...,...,...,...,...,...
466412,466412,lung,his,M,splendid,PG52248_text,M
466421,466421,eye,Fermin,M,sharp,PG52248_text,M
466424,466424,tongue,his,M,native,PG52248_text,M
466440,466440,foot,his,M,front,PG52248_text,M


In [110]:
#select only words of interest

interesting_bodyparts = ['eye', 'hand', 'face', 'arm', 'mouth', 'breast', 'heart', 'shoulder', 'lip', 'body', 'hair', 'back', 'brain', 'fist', 'chest', 'lap', 'waist', 'jaw', 'pupil']


df = df[df['bodypart'].isin(interesting_bodyparts)]
df

Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender
19,19,eye,his,M,grey,PG66460_text,M
21,21,eye,his,M,grey,PG66460_text,M
29,29,shoulder,his,M,giant,PG66460_text,M
36,36,eye,his,M,shrewd,PG66460_text,M
36,36,eye,his,M,bright,PG66460_text,M
...,...,...,...,...,...,...,...
466400,466400,hair,his,M,full,PG65818_text,M
466401,466401,hair,his,M,full,PG65818_text,M
466408,466408,hand,his,M,own,PG52248_text,M
466421,466421,eye,Fermin,M,sharp,PG52248_text,M


# split efter owner gender

In [111]:
#MALE AUTHORS - 2 dataframes, 1 for each owner gender

#male owners
male_owners_male_authors = df.loc[(df['owner_gender']=="M") & (df['author_gender']=="M")]

#female owners
female_owners_male_authors = df.loc[(df['owner_gender']=="F") & (df['author_gender']=="M")]

female_owners_male_authors.head()

Unnamed: 0.1,Unnamed: 0,bodypart,owner,owner_gender,description,ID,author_gender
42,42,hair,her,F,light,PG66460_text,M
271,271,eye,her,F,great,PG3479_text,M
271,271,eye,her,F,wide,PG3479_text,M
272,272,eye,her,F,great,PG3479_text,M
272,272,eye,her,F,wide,PG3479_text,M


In [112]:
#FEMALE AUTHORS - 2 dataframes, 1 for each owner gender

##HER SPLITTES DET EFTER OWNER GENDER

#male owners
male_owners_female_authors = df.loc[(df['owner_gender']=="M") & (df['author_gender']=="F")]

#female owners
female_owners_female_authors = df.loc[(df['owner_gender']=="F") & (df['author_gender']=="F")]


In [130]:
#count the combination of bodyparts and their corresponding descriptions

#female authors
female_owners_female_authors = female_owners_female_authors.groupby(['bodypart','description']).size().reset_index().rename(columns={0:'count_adj_ff'})
male_owners_female_authors = male_owners_female_authors.groupby(['bodypart','description']).size().reset_index().rename(columns={0:'count_adj_mf'})

#male authors
female_owners_male_authors = female_owners_male_authors.groupby(['bodypart','description']).size().reset_index().rename(columns={0:'count_adj_fm'})
male_owners_male_authors = male_owners_male_authors.groupby(['bodypart','description']).size().reset_index().rename(columns={0:'count_adj_mm'})

In [131]:
female_owners_female_authors

Unnamed: 0,bodypart,description,count_adj_ff
0,arm,Magnificent,1
1,arm,accustomed,1
2,arm,aching,1
3,arm,bandaged,1
4,arm,bare,1
...,...,...,...
2033,waist,supple,1
2034,waist,tailored,1
2035,waist,tight,1
2036,waist,tiny,1


In [132]:
adj_list = ['strong', 'left', 'right','own']

In [134]:

ff = female_owners_female_authors.loc[female_owners_female_authors['bodypart']=="arm"]
ff['std_female_owners_female_authors'] = ff.apply(lambda row: int(row.count_adj_ff)/ff['count_adj_ff'].sum(), axis=1)
ff = ff.sort_values(by = "std_female_owners_female_authors", axis=0, ascending=False)
ff = ff[ff['description'].isin(adj_list)]

mf = male_owners_female_authors.loc[male_owners_female_authors['bodypart']=="arm"]
mf['std_male_owners_female_authors'] = mf.apply(lambda row: int(row.count_adj_mf)/mf['count_adj_mf'].sum(), axis=1)
mf = mf.sort_values(by = "std_male_owners_female_authors", axis=0, ascending=False)
mf = mf[mf['description'].isin(adj_list)]

female_authors = pd.merge(ff, mf, on=['description'])
female_authors.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arm_f['std_female_owners_female_authors'] = arm_f.apply(lambda row: int(row.count_adj_ff)/arm_f['count_adj_ff'].sum(), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arm_m['std_male_owners_female_authors'] = arm_m.apply(lambda row: int(row.count_adj_mf)/arm_m['count_adj_mf'].sum(), axis=1)


Unnamed: 0,bodypart,description,count_adj_mf,std_male_owners_female_authors
83,arm,right,1,0.007246
73,arm,own,1,0.007246
105,arm,strong,1,0.007246
56,arm,left,1,0.007246


In [135]:
mm = male_owners_male_authors.loc[male_owners_male_authors['bodypart']=="arm"]
mm['std_male_owners_male_authors'] = mm.apply(lambda row: int(row.count_adj_fm)/mm['count_adj_mm'].sum(), axis=1)
mm = mm.sort_values(by = "std_male_owners_male_authors", axis=0, ascending=False)
mm = mm[mm['description'].isin(adj_list)]

fm = female_owners_female_authors.loc[female_owners_male_authors['bodypart']=="arm"]
fm['std_female_owners_male_authors'] = fm.apply(lambda row: int(row.count_adj_fm)/fm['count_adj_fm'].sum(), axis=1)
fm = mf.sort_values(by = "std_female_owners_male_authors", axis=0, ascending=False)
fm = mf[mf['description'].isin(adj_list)]

female_authors = pd.merge(mm, fm, on=['description'])
female_authors.head()

Unnamed: 0,bodypart_x,description,count_adj_ff,std_female_owners_female_authors,bodypart_y,count_adj_mf,std_male_owners_female_authors
0,arm,right,1,0.008403,arm,1,0.007246
1,arm,own,1,0.008403,arm,1,0.007246
2,arm,strong,1,0.008403,arm,1,0.007246
3,arm,left,1,0.008403,arm,1,0.007246


In [None]:
#sort alphabetically

male_authors= male_authors.sort_values(by = "bodypart", axis=0, ascending=True)
male_authors

#FEMALE AUTHORS
female_authors= female_authors.sort_values(by = "bodypart", axis=0, ascending=True)
female_authors