In [24]:
import pandas as pd
# Source file is fetched over plain HTTP on every run; nothing is cached locally.
data_url = "http://www-stat.wharton.upenn.edu/~waterman/DataSets/uva.txt"

# The file is tab-delimited, so read_csv with an explicit tab separator parses it.
df = pd.read_csv(data_url, sep="\t")

# Preview the first five rows.
df.head()

Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
0,id74364,0,54.0,Male,$50-74,Gay male,Ontario,Some College,Computer,Other,4-6 yr
1,id84505,0,39.0,Female,Over $100,Heterosexual,Sweden,Professional,Other,Other,1-3 yr
2,id84509,1,49.0,Female,$40-49,Heterosexual,Washington,Some College,Management,Other,Under 6 mo
3,id87028,1,22.0,Female,$40-49,Heterosexual,Florida,Some College,Computer,Married,6-12 mo
4,id76087,0,20.0,Male,$30-39,Bisexual,New Jersey,Some College,Education,Single,1-3 yr


In [25]:
# Remove the 'who' identifier column from df in place. pop() returns the removed
# column, so the cell echoes that Series. Not idempotent: re-running this cell
# raises KeyError because the column is already gone.
df.pop('who')

0        id74364
1        id84505
2        id84509
3        id87028
4        id76087
          ...   
19578    id83400
19579    id72216
19580     id8654
19581    id84503
19582    id87674
Name: who, Length: 19583, dtype: object

In [26]:
# Remove the 'Country' column from df in place; the returned Series is echoed.
# Not idempotent: a second run raises KeyError.
df.pop('Country')

0           Ontario
1            Sweden
2        Washington
3           Florida
4        New Jersey
            ...    
19578         Texas
19579    New Jersey
19580      Missouri
19581      Kentucky
19582    California
Name: Country, Length: 19583, dtype: object

In [27]:
# Remove the 'Years on Internet' column from df in place; the returned Series is
# echoed. Not idempotent: a second run raises KeyError.
# NOTE(review): presumably dropped because it would closely track the 'Newbie'
# target used later -- confirm the intent.
df.pop('Years on Internet')

0            4-6 yr
1            1-3 yr
2        Under 6 mo
3           6-12 mo
4            1-3 yr
            ...    
19578        4-6 yr
19579        4-6 yr
19580        1-3 yr
19581    Under 6 mo
19582        1-3 yr
Name: Years on Internet, Length: 19583, dtype: object

In [28]:
# Show the dtype of each remaining column; the object-typed ones are the columns
# converted to 'category' in the following cell.
df.dtypes

Newbie                    int64
Age                     float64
Gender                   object
Household Income         object
Sexual Preference        object
Education Attainment     object
Major Occupation         object
Marital Status           object
dtype: object

In [29]:
# Text-valued columns that are re-typed as pandas categoricals.
category_cols = [
    "Gender",
    "Household Income",
    "Sexual Preference",
    "Education Attainment",
    "Major Occupation",
    "Marital Status",
]

# Cast every listed column in one assignment; df is still modified in place,
# column by column.
df[category_cols] = df[category_cols].astype("category")

# Confirm the new dtypes.
df.dtypes

Newbie                     int64
Age                      float64
Gender                  category
Household Income        category
Sexual Preference       category
Education Attainment    category
Major Occupation        category
Marital Status          category
dtype: object

In [30]:
# One-hot encode the categorical columns; numeric columns (Newbie, Age) pass
# through unchanged and stay at the front of the frame.
df_onehot = pd.get_dummies(data=df)

# (rows, columns) of the encoded frame.
df_onehot.shape

(19583, 38)

In [31]:
# Per-column count of missing values (isna is the same check as isnull).
df_onehot.isna().sum()

Newbie                                 0
Age                                  561
Gender_Female                          0
Gender_Male                            0
Household Income_$10-19                0
Household Income_$20-29                0
Household Income_$30-39                0
Household Income_$40-49                0
Household Income_$50-74                0
Household Income_$75-99                0
Household Income_Over $100             0
Household Income_Under $10             0
Sexual Preference_Bisexual             0
Sexual Preference_Gay male             0
Sexual Preference_Heterosexual         0
Sexual Preference_Lesbian              0
Sexual Preference_Transgender          0
Sexual Preference_na                   0
Education Attainment_College           0
Education Attainment_Doctoral          0
Education Attainment_Grammar           0
Education Attainment_High School       0
Education Attainment_Masters           0
Education Attainment_Other             0
Education Attain

In [32]:
# Fill the missing Age values with the column mean (the mean skips NaN).
# NOTE(review): the mean is taken over all rows, i.e. before the later
# train/test split, so held-out rows contribute to the imputed value.
df_onehot["Age"] = df_onehot["Age"].fillna(df_onehot["Age"].mean())

In [33]:
# Build the model inputs from the encoded frame.
#
# The target is selected by name instead of by position, so a change in column
# order cannot silently swap the label for a feature. The feature matrix is
# forced to float: with boolean dummy columns (the get_dummies default in
# pandas >= 2.0) a plain `.values` on the mixed float/bool frame would yield an
# object-dtype array.
x_data = df_onehot.drop(columns="Newbie").to_numpy(dtype=float)

# Target as a column vector of shape (n_rows, 1), as in the original cell.
y_data = df_onehot["Newbie"].to_numpy().reshape(-1, 1)

# Echo both shapes as a sanity check.
y_data.shape, x_data.shape

((19583, 1), (19583, 37))

In [34]:
from sklearn import preprocessing

# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full matrix, before the train/test
# split in the next cell, so the held-out rows influence the per-column
# minima/maxima used for the training features.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(x_data)
x_data = min_max_scaler.transform(x_data)

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.33,
    random_state=42,
)

# Echo the shapes of the two feature partitions.
(X_train.shape, X_test.shape)

((13120, 37), (6463, 37))

In [36]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an intercept term; all other settings are left at
# the library defaults.
logreg = LogisticRegression(fit_intercept=True)

# The labels are stored as a column vector, so hand the estimator a 1-D view.
# fit() returns the estimator, which the cell echoes.
logreg.fit(X=X_train, y=y_train.ravel())

In [37]:
# NOTE(review): the expression below is not assigned to any name, so evaluating
# it does not affect `logreg`. It looks like the echoed repr of the estimator
# fitted in the previous cell that ended up under an input prompt -- confirm,
# and delete the cell if so.
LogisticRegression(C=1.0, class_weight=None,
                  dual=False, fit_intercept=True,
                  intercept_scaling=1, l1_ratio=None, max_iter=100,
                  multi_class='warn', n_jobs=None, penalty='l2',
                  random_state=None, solver='warn', tol=0.0001,
                  verbose=0, warm_start=False)

In [38]:
# Predicted class labels for the first five held-out rows.
logreg.predict(X_test[:5])

array([0, 0, 0, 0, 0], dtype=int64)

In [39]:
# Per-class probabilities for the same five held-out rows: one row per sample,
# one column per class, in the order given by logreg.classes_.
logreg.predict_proba(X_test[:5])

array([[0.56843258, 0.43156742],
       [0.91112572, 0.08887428],
       [0.79481085, 0.20518915],
       [0.85841562, 0.14158438],
       [0.62764603, 0.37235397]])

In [40]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Predictions for the whole held-out set, and a private copy of its labels.
y_pred = logreg.predict(X_test)
y_true = y_test.copy()

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[4487,  275],
       [1350,  351]], dtype=int64)

In [41]:
# Fraction of held-out rows whose predicted label equals the true label.
# NOTE(review): in the recorded run the confusion matrix has 4762 of 6463 rows
# in class 0, so always predicting 0 would already score about 0.737; accuracy
# alone says little about how well class 1 is detected.
accuracy_score(y_true, y_pred)

0.7485687761101656