# 패럴림픽 선수들의 목표 시각화
1. 철학 (philosophy)
2. 야망 (ambition)
3. 영웅 (hero)

In [None]:
import pandas as pd
import re
from nltk import ngrams
from collections import Counter
import spacy
# English pipelines include a rule-based lemmatizer
nlp = spacy.load("en_core_web_sm")

## 0. Cleaning

In [4]:
df = pd.read_csv('athletes.csv')
df

Unnamed: 0,name,country,sports,age,gender,philosophy,ambition,hero
0,AAJIM Munkhbat,Mongolia,Judo,32,Male,,,
1,ABARZA Alberto,Chile,Swimming,36,Male,"""Swimming is not an individual sport. Certainl...",To compete at the 2020 Paralympic Games in Tok...,"Chilean tennis player Marcelo Rios, Brazilian ..."
2,ABASLI Namig,Azerbaijan,Judo,23,Male,,,
3,ABASSI Mostefa,Algeria,Wheelchair Basketball,43,Male,"""What doesn't kill you makes you stronger."" (A...",,"US basketballer LeBron James. (Athlete, 26 Aug..."
4,ABBAD Abderraouf,Algeria,Wheelchair Basketball,34,Male,,,"His father. (Athlete, 26 Aug 2021)"
...,...,...,...,...,...,...,...,...
4520,ZURABIANI Zurab,Georgia,Judo,21,Male,,,
4521,ZURBRUGG Lindsey,United States of America,Wheelchair Basketball,22,Female,"""My goal every day is to make another person h...",To compete at the 2020 Paralympic Games in Tok...,
4522,ZVINOWANDA Vimbai,Zimbabwe,Athletics,29,Female,,,"Zimbabwean swimmer Kirsty Coventry, Zimbabwean..."
4523,ZWOUKHI Fathi,Tunisia,Triathlon,35,Male,,,


In [17]:
def clean(string):
    if isinstance(string, str):
        string = re.sub('\(.+?\)$', '', string)
        string = re.sub('\"', '', string)
        if string.endswith(' '):
            string = string[:-1]
        return string
    else:
        return 'none'

In [18]:
df['philosophy'] = df['philosophy'].apply(lambda x: clean(x))
df['ambition'] = df['ambition'].apply(lambda x: clean(x))
df['hero'] = df['hero'].apply(lambda x: clean(x))

In [19]:
df

Unnamed: 0,name,country,sports,age,gender,philosophy,ambition,hero
0,AAJIM Munkhbat,Mongolia,Judo,32,Male,none,none,none
1,ABARZA Alberto,Chile,Swimming,36,Male,Swimming is not an individual sport. Certainly...,To compete at the 2020 Paralympic Games in Tokyo.,"Chilean tennis player Marcelo Rios, Brazilian ..."
2,ABASLI Namig,Azerbaijan,Judo,23,Male,none,none,none
3,ABASSI Mostefa,Algeria,Wheelchair Basketball,43,Male,What doesn't kill you makes you stronger.,none,US basketballer LeBron James.
4,ABBAD Abderraouf,Algeria,Wheelchair Basketball,34,Male,none,none,His father.
...,...,...,...,...,...,...,...,...
4520,ZURABIANI Zurab,Georgia,Judo,21,Male,none,none,none
4521,ZURBRUGG Lindsey,United States of America,Wheelchair Basketball,22,Female,My goal every day is to make another person ha...,To compete at the 2020 Paralympic Games in Tokyo.,none
4522,ZVINOWANDA Vimbai,Zimbabwe,Athletics,29,Female,none,none,"Zimbabwean swimmer Kirsty Coventry, Zimbabwean..."
4523,ZWOUKHI Fathi,Tunisia,Triathlon,35,Male,none,none,none


In [36]:
def lemmatize(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

df['philosophy_lemma'] = df['philosophy'].apply(lambda x: lemmatize(x))
df['ambition_lemma'] = df['ambition'].apply(lambda x: lemmatize(x))

In [37]:
df.head()

Unnamed: 0,name,country,sports,age,gender,philosophy,ambition,hero,philosophy_lemma,ambition_lemma
0,AAJIM Munkhbat,Mongolia,Judo,32,Male,none,none,none,[none],[none]
1,ABARZA Alberto,Chile,Swimming,36,Male,Swimming is not an individual sport. Certainly...,To compete at the 2020 Paralympic Games in Tokyo.,"Chilean tennis player Marcelo Rios, Brazilian ...","[swimming, be, not, an, individual, sport, ., ...","[to, compete, at, the, 2020, Paralympic, Games..."
2,ABASLI Namig,Azerbaijan,Judo,23,Male,none,none,none,[none],[none]
3,ABASSI Mostefa,Algeria,Wheelchair Basketball,43,Male,What doesn't kill you makes you stronger.,none,US basketballer LeBron James.,"[what, do, not, kill, -PRON-, make, -PRON-, st...",[none]
4,ABBAD Abderraouf,Algeria,Wheelchair Basketball,34,Male,none,none,His father.,[none],[none]


In [46]:
def hero_list(hero):
    return [re.sub('^ | $|\.', '', h) for h in hero.split(',')]

df['hero_list'] = df['hero'].apply(lambda x: hero_list(x))

In [47]:
df.head()

Unnamed: 0,name,country,sports,age,gender,philosophy,ambition,hero,philosophy_lemma,ambition_lemma,hero_list
0,AAJIM Munkhbat,Mongolia,Judo,32,Male,none,none,none,[none],[none],[none]
1,ABARZA Alberto,Chile,Swimming,36,Male,Swimming is not an individual sport. Certainly...,To compete at the 2020 Paralympic Games in Tokyo.,"Chilean tennis player Marcelo Rios, Brazilian ...","[swimming, be, not, an, individual, sport, ., ...","[to, compete, at, the, 2020, Paralympic, Games...","[Chilean tennis player Marcelo Rios, Brazilian..."
2,ABASLI Namig,Azerbaijan,Judo,23,Male,none,none,none,[none],[none],[none]
3,ABASSI Mostefa,Algeria,Wheelchair Basketball,43,Male,What doesn't kill you makes you stronger.,none,US basketballer LeBron James.,"[what, do, not, kill, -PRON-, make, -PRON-, st...",[none],[US basketballer LeBron James]
4,ABBAD Abderraouf,Algeria,Wheelchair Basketball,34,Male,none,none,His father.,[none],[none],[His father]


In [48]:
df.to_csv('athletes_cleaned.csv', index=False, encoding='utf-8')

## 1. Philosophy & Ambition

In [51]:
def bigram(lemmas):
    return list(ngrams(lemmas, 2))

df['philosophy_bigram'] = df['philosophy_lemma'].apply(lambda x: bigram(x))
df['ambition_bigram'] = df['ambition_lemma'].apply(lambda x: bigram(x))

In [55]:
def trigram(lemmas):
    try:
        return list(ngrams(lemmas, 3))
    except: # ['none']
        return []

df['philosophy_trigram'] = df['philosophy_lemma'].apply(lambda x: trigram(x))
df['ambition_trigram'] = df['ambition_lemma'].apply(lambda x: trigram(x))

In [56]:
df.sample(10)

Unnamed: 0,name,country,sports,age,gender,philosophy,ambition,hero,philosophy_lemma,ambition_lemma,hero_list,philosophy_bigram,ambition_bigram,philosophy_trigram,ambition_trigram
3723,SODARIO TORQUATO Nathan Cesar,Brazil,Taekwondo,20,Male,none,To win gold at the 2020 Paralympic Games in To...,none,[none],"[to, win, gold, at, the, 2020, Paralympic, Gam...",[none],[],"[(to, win), (win, gold), (gold, at), (at, the)...",[],"[(to, win, gold), (win, gold, at), (gold, at, ..."
4045,TSUCHIYA Minako,Japan,Judo,31,Female,To live without regrets.,To be happy in my life.,none,"[to, live, without, regret, .]","[to, be, happy, in, -PRON-, life, .]",[none],"[(to, live), (live, without), (without, regret...","[(to, be), (be, happy), (happy, in), (in, -PRO...","[(to, live, without), (live, without, regret),...","[(to, be, happy), (be, happy, in), (happy, in,..."
3073,PEDRELLI Roberta,Italy,Sitting Volleyball,42,Female,Never look back and always go forward. Live ev...,To compete at the 2020 Paralympic Games in Tokyo.,none,"[never, look, back, and, always, go, forward, ...","[to, compete, at, the, 2020, Paralympic, Games...",[none],"[(never, look), (look, back), (back, and), (an...","[(to, compete), (compete, at), (at, the), (the...","[(never, look, back), (look, back, and), (back...","[(to, compete, at), (compete, at, the), (at, t..."
2456,MARCHI Giorgia,Italy,Swimming,20,Female,none,none,none,[none],[none],[none],[],[],[],[]
2839,NICOLAI Boris,Germany,Boccia,36,Male,"If you give only 90% in training, you can not ...",none,Swiss tennis player Roger Federer.,"[if, -PRON-, give, only, 90, %, in, training, ...",[none],[Swiss tennis player Roger Federer],"[(if, -PRON-), (-PRON-, give), (give, only), (...",[],"[(if, -PRON-, give), (-PRON-, give, only), (gi...",[]
3252,RADOVIC Filip,Montenegro,Table Tennis,21,Male,none,To compete at the 2020 Paralympic Games in Tokyo.,Polish table tennis player Natalia Partyka.,[none],"[to, compete, at, the, 2020, Paralympic, Games...",[Polish table tennis player Natalia Partyka],[],"[(to, compete), (compete, at), (at, the), (the...",[],"[(to, compete, at), (compete, at, the), (at, t..."
2449,MAO Jingdian,People's Republic of China,Table Tennis,26,Female,Be the first.,To compete at the 2020 Paralympic Games in Tokyo.,none,"[be, the, first, .]","[to, compete, at, the, 2020, Paralympic, Games...",[none],"[(be, the), (the, first), (first, .)]","[(to, compete), (compete, at), (at, the), (the...","[(be, the, first), (the, first, .)]","[(to, compete, at), (compete, at, the), (at, t..."
3836,SUZUKI Ayako,Japan,Badminton,34,Female,She has competed in able-bodied badminton at n...,To win gold at the 2020 Paralympic Games in To...,none,"[-PRON-, have, compete, in, able, -, bodied, b...","[to, win, gold, at, the, 2020, Paralympic, Gam...",[none],"[(-PRON-, have), (have, compete), (compete, in...","[(to, win), (win, gold), (gold, at), (at, the)...","[(-PRON-, have, compete), (have, compete, in),...","[(to, win, gold), (win, gold, at), (gold, at, ..."
364,BEHRE David,Germany,Athletics,34,Male,If I feel pain during training and just do not...,To win a medal at the 2020 Paralympic Games in...,South African Para athlete Oscar Pistorius.,"[if, -PRON-, feel, pain, during, training, and...","[to, win, a, medal, at, the, 2020, Paralympic,...",[South African Para athlete Oscar Pistorius],"[(if, -PRON-), (-PRON-, feel), (feel, pain), (...","[(to, win), (win, a), (a, medal), (medal, at),...","[(if, -PRON-, feel), (-PRON-, feel, pain), (fe...","[(to, win, a), (win, a, medal), (a, medal, at)..."
1744,ISHIURA Tomomi,Japan,Swimming,33,Female,Endurance makes you stronger.,To win gold at the 2020 Paralympic Games in To...,Japanese Para swimmer Rina Akiyama.,"[endurance, make, -PRON-, strong, .]","[to, win, gold, at, the, 2020, Paralympic, Gam...",[Japanese Para swimmer Rina Akiyama],"[(endurance, make), (make, -PRON-), (-PRON-, s...","[(to, win), (win, gold), (gold, at), (at, the)...","[(endurance, make, -PRON-), (make, -PRON-, str...","[(to, win, gold), (win, gold, at), (gold, at, ..."


In [79]:
def ngram_counter(col_name):
    ngrams = []
    for col in df[col_name].tolist():
        ngrams += [c for c in col if not any(sw in c for sw in ['-PRON-', 'be', 'do', 'the', 'there', 'a', 'and', '.', ',','[',']'])]
    result = pd.DataFrame(Counter(ngrams).most_common(20), columns=['ngram', 'freq'])
    result['ratio (%)'] = (result['freq'] / len(ngrams)) * 100
    return result

In [80]:
ngram_counter('philosophy_bigram')

Unnamed: 0,ngram,freq,ratio (%)
0,"(give, up)",212,1.43311
1,"(never, give)",158,1.068073
2,"(want, to)",116,0.784155
3,"(have, to)",105,0.709795
4,"(can, not)",93,0.628676
5,"(believe, in)",76,0.513757
6,"(work, hard)",74,0.500237
7,"(in, life)",62,0.419117
8,"(every, day)",54,0.365038
9,"(to, achieve)",45,0.304198


In [81]:
ngram_counter('philosophy_trigram')

Unnamed: 0,ngram,freq,ratio (%)
0,"(never, give, up)",157,1.978576
1,"(not, give, up)",33,0.415879
2,"(as, long, as)",23,0.289855
3,"(most, important, thing)",14,0.176434
4,"(to, give, up)",10,0.126024
5,"(no, matter, what)",10,0.126024
6,"(no, matter, how)",10,0.126024
7,"(give, up, on)",10,0.126024
8,"(about, how, hard)",9,0.113422
9,"(with, an, impairment)",9,0.113422


In [83]:
ngram_counter('ambition_bigram')

Unnamed: 0,ngram,freq,ratio (%)
0,"(Paralympic, Games)",3013,14.946919
1,"(Games, in)",2708,13.433872
2,"(in, Tokyo)",2574,12.769124
3,"(2020, Paralympic)",2550,12.650064
4,"(to, win)",1513,7.505705
5,"(to, compete)",1494,7.41145
6,"(compete, at)",1457,7.2279
7,"(medal, at)",1034,5.129477
8,"(win, gold)",440,2.182756
9,"(gold, medal)",404,2.004167


In [84]:
ngram_counter('ambition_trigram')

Unnamed: 0,ngram,freq,ratio (%)
0,"(Paralympic, Games, in)",2634,21.402454
1,"(Games, in, Tokyo)",2572,20.898676
2,"(2020, Paralympic, Games)",2549,20.71179
3,"(to, compete, at)",1451,11.790038
4,"(to, win, gold)",434,3.526448
5,"(win, gold, at)",396,3.217681
6,"(gold, medal, at)",377,3.063297
7,"(2024, Paralympic, Games)",108,0.877549
8,"(Games, in, Paris)",77,0.62566
9,"(to, compete, in)",37,0.300642


In [23]:
# https://www.jasondavies.com/wordtree/
philosophies = [philosophy for philosophy in df.philosophy.tolist() if philosophy != 'none']
with open('philosophies.txt', 'w') as f:
    f.write("\n".join(philosophies))

ambitions = [ambition for ambition in df.ambition.tolist() if ambition != 'none']
with open('ambitions.txt', 'w') as f:
    f.write("\n".join(ambitions))

## 2. Hero

In [87]:
heros = []
for h in df.hero_list.tolist():
    heros += [x for x in h if x != 'none']

In [88]:
result = pd.DataFrame(Counter(heros).most_common(20), columns=['hero', 'freq'])
result['ratio (%)'] = (result['freq'] / len(heros)) * 100
result

Unnamed: 0,hero,freq,ratio (%)
0,US swimmer Michael Phelps,106,4.18311
1,Jamaican sprinter Usain Bolt,94,3.70955
2,Swiss tennis player Roger Federer,44,1.736385
3,His father,36,1.420679
4,Argentinian footballer Lionel Messi,33,1.302289
5,US basketball player Michael Jordan,32,1.262826
6,Spanish tennis player Rafael Nadal,30,1.183899
7,Portuguese footballer Cristiano Ronaldo,29,1.144436
8,US boxer Muhammad Ali,23,0.907656
9,Her mother,18,0.710339


In [110]:
heros_nationality = []
for h in df.hero_list.tolist():
    heros_nationality += [x.split()[0] for x in h if x != 'none' and x.split()[0] not in ['His', 'Her']]

In [112]:
# 영웅으로 지목된 사람들의 국적
result = pd.DataFrame(Counter(heros_nationality).most_common(20), columns=['hero', 'freq'])
result['ratio (%)'] = (result['freq'] / len(heros)) * 100
result

Unnamed: 0,hero,freq,ratio (%)
0,US,397,15.66693
1,British,134,5.288082
2,Brazilian,112,4.41989
3,Russian,109,4.3015
4,Australian,107,4.222573
5,Jamaican,101,3.985793
6,Spanish,94,3.70955
7,Chinese,78,3.078137
8,French,75,2.959747
9,German,70,2.762431


In [113]:
# 영웅을 지목한 사람들의 국적
result = pd.DataFrame(Counter(df['country'].tolist()).most_common(20), columns=['hero', 'freq'])
result['ratio (%)'] = (result['freq'] / len(heros)) * 100
result

Unnamed: 0,hero,freq,ratio (%)
0,Japan,262,10.339384
1,People's Republic of China,256,10.102605
2,RPC,248,9.786898
3,United States of America,243,9.589582
4,Brazil,241,9.510655
5,Great Britain,221,8.721389
6,Australia,181,7.142857
7,France,145,5.722178
8,Ukraine,139,5.485399
9,Spain,137,5.406472


In [90]:
# 영웅을 지목한 선수들 중 육상, 수영 선수의 비중이 가장 높음 
# 영웅으로 지목된 선수들 중에도 육상, 수영 선수의 비율이 높음 
result = pd.DataFrame(Counter(df['sports'].tolist()).most_common(20), columns=['hero', 'freq'])
result['ratio (%)'] = (result['freq'] / len(heros)) * 100
result

Unnamed: 0,hero,freq,ratio (%)
0,Athletics,1142,45.067088
1,Swimming,604,23.835833
2,Table Tennis,278,10.970797
3,Wheelchair Basketball,262,10.339384
4,Cycling Road,213,8.405683
5,Sitting Volleyball,187,7.379637
6,Powerlifting,178,7.024467
7,Shooting,153,6.037885
8,Archery,139,5.485399
9,Judo,136,5.367009


In [107]:
condition = 'sports'
all_cases = sorted(list(set(df[condition].tolist())))
for case in all_cases:
    heros = []
    for h in df.query('sports==@case').hero_list.tolist():
        heros += [x.replace('His ', '').replace('Her ', '').replace('his ', '').replace('her ', '') for x in h if x != 'none']
    result = pd.DataFrame(Counter(heros).most_common(20), columns=['hero', 'freq'])
    result['ratio (%)'] = (result['freq'] / len(heros)) * 100
    print('********')
    print(case)
    print(result)

********
 Archery
                                              hero  freq  ratio (%)
0                         Dutch arcMike Schloesser     4   8.163265
1                                           mother     3   6.122449
2             Iranian Para arcAlisina Manshaezadeh     2   4.081633
3                  Czech Para arcDavid Drahoninsky     2   4.081633
4        Brazilian basketball player Oscar Schmidt     1   2.040816
5      South African rugby player Francois Pienaar     1   2.040816
6                      English Para arcJohn Stubbs     1   2.040816
7                          Colombian arcSara Lopez     1   2.040816
8                                US arcRandy Ulmer     1   2.040816
9          Italian footballer Alessandro Del Piero     1   2.040816
10           English folklore character Robin Hood     1   2.040816
11                         Czech arcJaromir Termer     1   2.040816
12                          US arcJesse Broadwater     1   2.040816
13                  Irish jock

********
 Judo
                                                 hero  freq  ratio (%)
0                                              father     3   6.818182
1                          Japanese judoka Shohei Ono     2   4.545455
2                           French judoka Teddy Riner     2   4.545455
3                     Japanese judoka Tadahiro Nomura     2   4.545455
4   Brazilians judokas Rogerio Sampaio and Aurelio...     1   2.272727
5               Brazilian Para judoka Antonio Tenorio     1   2.272727
6                Brazilian Para judoka Roberto Julian     1   2.272727
7                       Brazilian judoka Rafael Silva     1   2.272727
8           Brazilian judoka Antonio Tenorio da Silva     1   2.272727
9                     Brazilian judoka Eduardo Santos     1   2.272727
10                        Korean judoka Jeon Ki Young     1   2.272727
11                         Greek judoka Ilias Iliadis     1   2.272727
12                        Canadian singer Celine Dion     1   

 Table Tennis
                                                hero  freq  ratio (%)
0                 Spanish tennis player Rafael Nadal    11   7.971014
1        Swedish table tennis player Jan-Ove Waldner     8   5.797101
2                  Swiss tennis player Roger Federer     6   4.347826
3                Chinese table tennis player Ma Long     5   3.623188
4         Polish table tennis player Natalia Partyka     5   3.623188
5                US basketball player Michael Jordan     4   2.898551
6               German table tennis player Timo Boll     4   2.898551
7               Serbian tennis player Novak Djokovic     4   2.898551
8                                             father     3   2.173913
9                              US boxer Muhammad Ali     3   2.173913
10    Swedish Para table tennis player Ernst Bollden     2   1.449275
11               Argentinian footballer Lionel Messi     2   1.449275
12            Chinese table tennis player Zhang Jike     2   1.449275
13  Be

19                   Korean tennis player Chung Hyeon     1   1.351351


In [119]:
condition = 'gender'
all_cases = sorted(list(set(df[condition].tolist())))
for case in all_cases:
    heros = []
    for h in df.query('gender==@case').hero_list.tolist():
        heros += [x.replace('His ', '').replace('Her ', '').replace('his ', '').replace('her ', '') for x in h if x != 'none']
    result = pd.DataFrame(Counter(heros).most_common(20), columns=['hero', 'freq'])
    result['ratio (%)'] = (result['freq'] / len(heros)) * 100
    print('********')
    print(case)
    print(result)

********
Female
                                            hero  freq  ratio (%)
0                      US swimmer Michael Phelps    34   3.285024
1                   Jamaican sprinter Usain Bolt    31   2.995169
2                                         mother    20   1.932367
3                                         father    12   1.159420
4            British Para swimmer Ellie Simmonds    11   1.062802
5            Argentinian footballer Lionel Messi    10   0.966184
6                                        parents     9   0.869565
7                      US sprinter Allyson Felix     8   0.772947
8              Swiss tennis player Roger Federer     8   0.772947
9             Spanish tennis player Rafael Nadal     8   0.772947
10                Russian swimmer Yuliya Efimova     7   0.676329
11              Hungarian swimmer Katinka Hosszu     6   0.579710
12  Canadian wheelchair racer Chantal Petitclerc     6   0.579710
13     Dutch wheelchair tennis player EstVergeer     6   0.5