In [2]:
import pandas as pd

from sklearn.dummy import DummyClassifier           # <== Najprostszy możliwy model
from sklearn.linear_model import LogisticRegression # <== Regresja logistyczna (liniowa)

from sklearn.metrics import accuracy_score

In [None]:
# /Users/p/Documents/dev/Machine-Learning/001-basic-intro_python_numpy_pandas/input

In [3]:
df = pd.read_csv("/Users/p/Documents/dev/Machine-Learning/001-basic-intro_python_numpy_pandas/input/polish_names.csv")
df.head()

Unnamed: 0,name,gender
0,Abdon,m
1,Abel,m
2,Abercjusz,m
3,Abraham,m
4,Absalon,m


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    1705 non-null   object
 1   gender  1705 non-null   object
dtypes: object(2)
memory usage: 26.8+ KB


In [5]:
df.sample(10)

Unnamed: 0,name,gender
280,Cibor,m
1236,Selma,f
1125,Przemysław,m
123,Atanazy,m
1461,Wilma,f
1429,Wawrzyniec,m
262,Celestyn,m
304,Czechoń,m
1175,René,m
1700,Zbigniewa,f


In [6]:
df['gender'].value_counts()

gender
m    1033
f     672
Name: count, dtype: int64

In [10]:
def transform_string_into_number(string):
    return 1 if string == 'm' else 0

df['gender'].head().map( transform_string_into_number )

0    1
1    1
2    1
3    1
4    1
Name: gender, dtype: int64

In [11]:
df['target'] = df['gender'].map( lambda x: int(x == 'm') )
df.head(10)

Unnamed: 0,name,gender,target
0,Abdon,m,1
1,Abel,m,1
2,Abercjusz,m,1
3,Abraham,m,1
4,Absalon,m,1
5,Achacjusz,m,1
6,Achacy,m,1
7,Achilles,m,1
8,Ada,f,0
9,Adalbert,m,1


In [12]:
df['dl'] = df['name'].map(lambda x: len(x))
df.head(10)

Unnamed: 0,name,gender,target,dl
0,Abdon,m,1,5
1,Abel,m,1,4
2,Abercjusz,m,1,9
3,Abraham,m,1,7
4,Absalon,m,1,7
5,Achacjusz,m,1,9
6,Achacy,m,1,6
7,Achilles,m,1,8
8,Ada,f,0,3
9,Adalbert,m,1,8


### DummyClassifier

In [13]:
X = df[ ['dl'] ].values # Macierz cech
y = df['target'].values # Wektor zmiennej docelowej

model = DummyClassifier(strategy = 'stratified')
model.fit(X, y) # Model “uczy się” (w tym przypadku po prostu zapamiętuje proporcje klas z danych wejściowych).
y_pred = model.predict(X) # Model przewiduje etykiety dla tego samego zbioru X na podstawie swojej strategii.

In [14]:
y_pred

array([1, 1, 0, ..., 1, 1, 0])

In [15]:
df['gender_pred'] = y_pred
df['gender_pred'].value_counts()

gender_pred
1    1004
0     701
Name: count, dtype: int64

In [16]:
df[ df.target != y_pred ].shape # Kod ten pozwala znaleźć liczbę próbek, dla których model popełnił błąd predykcji (czyli rzeczywista wartość target nie zgadza się z y_pred).

(761, 5)

In [17]:
accuracy_score(y, y_pred) # Wywołanie funkcji accuracy_score(y, y_pred) oblicza dokładność klasyfikatora na podstawie rzeczywistych etykiet (y) i przewidzianych wartości (y_pred).

0.5536656891495602

### LogisticRegression

In [18]:
model = LogisticRegression(solver='lbfgs') # Algorytm optymalizacji używany do dopasowania modelu. lbfgs to efektywny algorytm gradientowy dla małych i średnich zbiorów danych.
model.fit(X, y)
y_pred = model.predict(X)
accuracy_score(y, y_pred)

0.6058651026392962

In [19]:
df['gender_pred'] = y_pred
df['gender_pred'].value_counts() # To oznacza, że model zawsze zwrócił 1 (każde imię to imię męskie), bo akurat ta klasa była bardziej popularna. Zróbmy eksperyment, jeśli manualnie przypiszesz zawsze odpowiedź 1, to dostaniesz ten sam wynik.

gender_pred
1    1705
Name: count, dtype: int64

In [20]:
y_pred = [1]*X.shape[0] # ilość jedynek powinna zgadzać się z ilością wierszy w macierzy X
accuracy_score(y, y_pred)

0.6058651026392962

In [21]:
def train_and_predict_model(X, y, model, success_metric=accuracy_score):
    model.fit(X, y)
    y_pred = model.predict(X)

    print("Distribution:")
    print(pd.Series(y_pred).value_counts())

    return success_metric(y, y_pred)

In [22]:
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'u', 'y']

def how_many_vowels(name):
    count = sum(map(lambda x: int(x in vowels), name.lower()))

    return count

how_many_vowels('Jana')

2

### LogisticRegression 2.0

In [24]:
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'u', 'y']

def how_many_vowels(name):
    count = sum(map(lambda x: int(x in vowels), name.lower()))

    return count

df['count_vowels'] = df['name'].map(how_many_vowels)
train_and_predict_model(df[['dl', 'count_vowels'] ], y, LogisticRegression(solver='lbfgs'))

Distribution:
1    1082
0     623
Name: count, dtype: int64


0.7143695014662756

### LogisticRegression 3.0

In [25]:
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'u', 'y']

def first_is_vowel(name):
    return name.lower()[0] in vowels

df['first_is_vowel'] = df['name'].map(first_is_vowel)

train_and_predict_model(df[['dl', 'count_vowels', 'first_is_vowel'] ], y, LogisticRegression(solver='lbfgs'))

Distribution:
1    1106
0     599
Name: count, dtype: int64


0.7296187683284457

### Funkcja factorize()

In [27]:
labels, uniques = pd.factorize(['blue', 'green', 'yellow', 'blue'])

print("Labels:", labels)      # Tablica z zakodowanymi wartościami
print("Uniques:", uniques)    # Unikalne wartości

Labels: [0 1 2 0]
Uniques: ['blue' 'green' 'yellow']


  labels, uniques = pd.factorize(['blue', 'green', 'yellow', 'blue'])


In [28]:
pd.factorize(['blue', 'green', 'yellow', 'blue'])[0]

  pd.factorize(['blue', 'green', 'yellow', 'blue'])[0]


array([0, 1, 2, 0])

### LogisticRegression 4.0

In [30]:
df['first_letter'] = df['name'].map(lambda x: x.lower()[0])
df['first_letter'].head(5)

0    a
1    a
2    a
3    a
4    a
Name: first_letter, dtype: object

In [31]:
df['first_letter'] = df['name'].map(lambda x: x.lower()[0])
df['first_letter_cnt'] = df['first_letter'].factorize()[0]

X = df[['dl', 'count_vowels', 'first_is_vowel', 'first_letter_cnt'] ]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs'))

Distribution:
1    1106
0     599
Name: count, dtype: int64


0.7296187683284457

In [32]:
df['name'].map(lambda x: x.lower()[-1])

0       n
1       l
2       z
3       m
4       n
       ..
1700    a
1701    a
1702    a
1703    s
1704    t
Name: name, Length: 1705, dtype: object

### LogisticRegression 5.0

In [None]:
def get_all_vowels(name):
    all_vowels = [letter for letter in name.lower() if letter in vowels]

    return ''.join(all_vowels)

print(get_all_vowels('Piotr')) # io

df['all_vowels'] = df['name'].map(get_all_vowels)
df['all_vowels_cnt'] = pd.factorize(df['all_vowels'])[0]


X = df[['dl', 'count_vowels', 'first_is_vowel', 'first_letter_cnt', 'all_vowels_cnt'] ]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs'))

io
Distribution:
1    1129
0     576
Name: count, dtype: int64


0.73841642228739

In [None]:
def get_all_consonants(name):
    all_consonants = [letter for letter in name.lower() if letter not in vowels]

    return ''.join(all_consonants)

df['all_consonants'] = df['name'].map(get_all_consonants)
df['all_consonants_cnt'] = pd.factorize(df['all_consonants'])[0]

X = df[['dl', 'count_vowels', 'first_is_vowel', 'first_letter_cnt', 'all_consonants_cnt']]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs', max_iter=200))

Distribution:
1    1111
0     594
Name: count, dtype: int64


0.7313782991202346

### LogisticRegression 6.0

In [48]:
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'u', 'y']

def last_is_vowel(name):
    return name.lower()[-1] in vowels

df['last_is_vowel'] = df['name'].map(last_is_vowel)

X = df[['last_is_vowel']]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs', max_iter=200))

Distribution:
1    964
0    741
Name: count, dtype: int64


0.9524926686217009

In [49]:
feats = ['last_is_vowel', 'dl', 'count_vowels', 'first_is_vowel', 'all_vowels_cnt', 'all_consonants_cnt']
X = df[ feats ]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs', max_iter=200))

Distribution:
1    964
0    741
Name: count, dtype: int64


0.9524926686217009

In [50]:
df['lst_letter_a'] = df.name.map(lambda x: x[-1] == 'a')

df[ (df.gender == 'm') & df.lst_letter_a ]

Unnamed: 0,name,gender,target,dl,gender_pred,count_vowels,first_is_vowel,first_letter,first_letter_cnt,all_vowels,all_vowels_cnt,all_consonants,all_consonants_cnt,last_is_vowel,lst_letter_a
142,Barnaba,m,1,7,1,3,False,b,1,aaa,3,brnb,116,True,True
219,Bonawentura,m,1,11,1,5,False,b,1,oaeua,102,bnwntr,178,True,True
765,Kosma,m,1,5,1,2,False,k,9,oa,86,ksm,598,True,True
1574,Batszeba,m,1,8,1,3,False,b,1,aea,11,btszb,1201,True,True


In [51]:
df[ (df.gender == 'f') & (~df.lst_letter_a) ]

Unnamed: 0,name,gender,target,dl,gender_pred,count_vowels,first_is_vowel,first_letter,first_letter_cnt,all_vowels,all_vowels_cnt,all_consonants,all_consonants_cnt,last_is_vowel,lst_letter_a
156,Beatrice,f,0,8,1,4,False,b,1,eaie,67,btrc,129,True,False
157,Beatrycze,f,0,9,1,4,False,b,1,eaye,68,btrcz,130,True,False
171,Berenike,f,0,8,1,4,False,b,1,eeie,78,brnk,142,True,False
257,Carmen,f,0,6,1,2,False,c,2,ae,1,crmn,207,False,False
611,Inez,f,0,4,1,2,True,i,7,ie,226,nz,478,False,False
1215,Salome,f,0,6,1,3,False,s,17,aoe,157,slm,926,True,False
1570,Abigail,f,0,7,1,4,True,a,0,aiai,413,bgl,1197,False,False
1639,Ingrid,f,0,6,1,2,True,i,7,ii,228,ngrd,481,False,False
1663,Margot,f,0,6,1,2,False,m,12,ao,0,mrgt,1263,False,False
1666,Miriam,f,0,6,1,3,False,m,12,iia,249,mrm,1265,False,False
