# Machine Learning - Homework 2

In [None]:
import pandas as pd
from sklearn.base import is_regressor
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [None]:
def read_csv_and_analyze_dataset(path_to_csv):
  """Load a CSV file, print a short overview of it and return the DataFrame.

  The overview consists of the DataFrame itself followed by, for every
  non-numeric column, each distinct value and the number of rows holding it.
  Distinct values are listed in order of first appearance.

  Args:
    path_to_csv: Path (or any other source accepted by `pd.read_csv`) of the
      CSV file to load.

  Returns:
    The loaded `pd.DataFrame`, unmodified.
  """
  dataframe = pd.read_csv(path_to_csv)

  print("The dataset:")
  print(dataframe, end='\n\n')

  print("All the unique values for all columns that contain discrete values:")

  for column_name in dataframe:
    column = dataframe[column_name]

    if pd.api.types.is_numeric_dtype(column):
      continue

    print(f"\n{column_name}:")

    for unique_value in column.unique():
      # A missing value never compares equal to anything (not even itself),
      # so it has to be counted with `isna` instead of `==`; otherwise it
      # would always be reported with a count of 0.
      if pd.isna(unique_value):
        occurrences = column.isna().sum()
      else:
        occurrences = (column == unique_value).sum()

      print(f"{unique_value} ({int(occurrences)})")

  print()

  return dataframe

def transform_binary_and_nominal_columns(dataframe, column_names):
  """One-hot encode the given columns and return the result as a new DataFrame.

  Binary columns are collapsed to a single 0/1 column (`drop='if_binary'`);
  columns with more categories get one indicator column per category. All
  other columns are passed through unchanged. Output columns are named by the
  transformer (`onehotencoder__<column>_<category>` for encoded columns and
  `remainder__<column>` for the others).

  Args:
    dataframe: Input `pd.DataFrame`; it is not modified.
    column_names: List of names of the columns to one-hot encode.

  Returns:
    A new `pd.DataFrame` with the encoded columns first, followed by the
    passed-through columns, indexed like `dataframe`.
  """
  transformer = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), column_names),
    remainder='passthrough',
    # Always produce a dense array: with the default threshold a column with
    # many categories can make the transformer return a SciPy sparse matrix,
    # which cannot be wrapped in a DataFrame with named columns below.
    sparse_threshold=0
  )

  transformed = transformer.fit_transform(dataframe)

  return pd.DataFrame(
      transformed,
      # Keep the caller's row labels instead of silently resetting them.
      index=dataframe.index,
      columns=transformer.get_feature_names_out()
  )

def transform_ordinal_columns(dataframe, column_names):
  """Replace the given columns with their ordinal (integer-coded) encoding.

  Each listed column is encoded independently with `OrdinalEncoder`, which
  maps its categories to the codes 0, 1, 2, ... stored as floats.

  Args:
    dataframe: Input `pd.DataFrame`; it is not modified.
    column_names: List of names of the columns to encode.

  Returns:
    A copy of `dataframe` in which the listed columns hold the encoded values.
  """
  # Work on a copy so the caller's frame is left untouched, consistent with
  # transform_binary_and_nominal_columns, which also returns a new frame.
  encoded = dataframe.copy()

  # Nothing to encode; OrdinalEncoder cannot be fitted on zero columns.
  if len(column_names) == 0:
    return encoded

  encoded[column_names] = OrdinalEncoder().fit_transform(
      dataframe[column_names]
  )

  return encoded

def train_and_test_model(X, y, model, test_size=0.2, random_state=None):
  """Fit `model` on a random train split and print metrics for the test split.

  For regressors the mean squared error and the coefficient of determination
  (R^2) are printed; for any other estimator the accuracy and the confusion
  matrix are printed.

  Args:
    X: Feature matrix (e.g. a `pd.DataFrame`).
    y: Target values aligned with the rows of `X`.
    model: Unfitted scikit-learn estimator; it is fitted in place.
    test_size: Fraction of the rows held out for testing.
    random_state: Seed for the shuffled split. Pass an integer to make the
      split, and therefore the printed metrics, reproducible; the default
      (`None`) draws a different split on every call.

  Returns:
    None. The metrics are only printed.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X,
      y,
      test_size=test_size,
      shuffle=True,
      random_state=random_state
  )

  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  # Decide by estimator kind rather than by class name, so that every
  # regressor (not only the class literally named LinearRegression) is scored
  # with regression metrics.
  if is_regressor(model):
    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
  else:
    print("Accuracy: %.2f" % model.score(X_test, y_test))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))


## Linear Regression

Linear model for predicting the amount of the tip for a restaurant meal.

The dataset was taken from "Dianne Cook, Deborah F. Swayne, Interactive and Dynamic Graphics for Data Analysis: With R and GGobi, Springer, 2007" (7.1 Tips, p. 153).

### Analyze the dataset

In [None]:
# Load the tips dataset; the helper also prints the frame and, for each
# non-numeric column, its distinct values with their row counts.
tips_df = read_csv_and_analyze_dataset('tips.csv')

The dataset:
     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]

All the unique values for all columns that contain discrete values:

sex:
Female (87)
Male (157)

smoker:
No (151)
Yes (93)

day:
Sun (76)
Sat (87)
Thur (62)
Fri (19)

time:
Dinner (176)
Lunch (68)



### Preprocess the dataset

In [None]:
# Encode the day of the week as an integer (Thur=4 ... Sun=7). `map` builds
# the numeric column directly instead of relying on `replace` to silently
# downcast an object column to integers, which newer pandas versions no
# longer do. A day missing from the mapping would become NaN.
day_to_number = {'Thur': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}

tips_df = tips_df.assign(day=tips_df['day'].map(day_to_number))

# Two-valued columns; each is encoded as a single 0/1 indicator column.
binary_column_names = ['sex', 'smoker', 'time']

tips_df = transform_binary_and_nominal_columns(tips_df, binary_column_names)

tips_df

Unnamed: 0,onehotencoder__sex_Male,onehotencoder__smoker_Yes,onehotencoder__time_Lunch,remainder__total_bill,remainder__tip,remainder__day,remainder__size
0,0.0,0.0,0.0,16.99,1.01,7.0,2.0
1,1.0,0.0,0.0,10.34,1.66,7.0,3.0
2,1.0,0.0,0.0,21.01,3.50,7.0,3.0
3,1.0,0.0,0.0,23.68,3.31,7.0,2.0
4,0.0,0.0,0.0,24.59,3.61,7.0,4.0
...,...,...,...,...,...,...,...
239,1.0,0.0,0.0,29.03,5.92,6.0,3.0
240,0.0,1.0,0.0,27.18,2.00,6.0,2.0
241,1.0,1.0,0.0,22.67,2.00,6.0,2.0
242,1.0,0.0,0.0,17.82,1.75,6.0,2.0


### Train and test the model

In [None]:
# Split off the target (the tip amount). `pop` removes the column from
# `tips_df` in place, leaving only the features; because of that, running this
# cell a second time without re-running the preprocessing raises a KeyError.
y = tips_df.pop('remainder__tip')

In [None]:
# Fit a linear regression on the remaining features and print its test
# metrics. The train/test split is drawn at random without a fixed seed, so
# the printed numbers vary from run to run.
train_and_test_model(tips_df, y, LinearRegression())

Mean squared error: 0.66
Coefficient of determination: 0.49


## Logistic Regression

Logistic regression model for classifying a song by its type (e.g. rock or classical).

The dataset was taken from "Dianne Cook, Deborah F. Swayne, Interactive and Dynamic Graphics for Data Analysis: With R and GGobi, Springer, 2007" (7.12 Music, p. 171).

### Analyze the dataset

In [None]:
# Load the music dataset; the helper also prints the frame and, for each
# non-numeric column, its distinct values with their row counts.
music_df = read_csv_and_analyze_dataset('music-sub.csv')

The dataset:
       Unnamed: 0   Artist       Type        LVar       LAve   LMax  \
0   Dancing Queen     Abba       Rock  17600755.6 -90.006867  29921   
1      Knowing Me     Abba       Rock   9543020.9 -75.766719  27626   
2   Take a Chance     Abba       Rock   9049481.5 -98.062924  26372   
3       Mamma Mia     Abba       Rock   7557437.3 -90.471062  28898   
4     Lay All You     Abba       Rock   6282285.6 -88.952631  27940   
..            ...      ...        ...         ...        ...    ...   
57       Waterloo     Abba       Rock  24898675.9 -93.996187  29830   
58            V11  Vivaldi  Classical   1879989.2  12.721337   8601   
59            V12  Vivaldi  Classical    737349.6   5.719002   7089   
60            V13  Vivaldi  Classical   2865979.9  21.446763  17282   
61       Hey Jude  Beatles       Rock   8651854.1  -6.132241  18509   

       LFEner      LFreq  
0   105.92095   59.57379  
1   102.83616   58.48031  
2   102.32488  124.59397  
3   101.61648   48.76513  

### Preprocess the dataset

In [None]:
# 'Type' is a nominal label rather than an ordinal one; it is integer-coded
# with the 'ordinal' transformation only so that it can serve as the target
# 'y' of the multiclass logistic regression model.
type_column_name = ['Type']

nominal_column_names = ['Artist']

# Drop the song-title column, encode the target, then one-hot encode the
# artist.
music_df = (
    music_df
    .drop(columns='Unnamed: 0')
    .pipe(transform_ordinal_columns, type_column_name)
    .pipe(transform_binary_and_nominal_columns, nominal_column_names)
)

music_df

Unnamed: 0,onehotencoder__Artist_Abba,onehotencoder__Artist_Beatles,onehotencoder__Artist_Beethoven,onehotencoder__Artist_Eels,onehotencoder__Artist_Enya,onehotencoder__Artist_Mozart,onehotencoder__Artist_Vivaldi,remainder__Type,remainder__LVar,remainder__LAve,remainder__LMax,remainder__LFEner,remainder__LFreq
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,17600755.6,-90.006867,29921.0,105.92095,59.57379
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,9543020.9,-75.766719,27626.0,102.83616,58.48031
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,9049481.5,-98.062924,26372.0,102.32488,124.59397
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,7557437.3,-90.471062,28898.0,101.61648,48.76513
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,6282285.6,-88.952631,27940.0,100.30076,74.02039
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,24898675.9,-93.996187,29830.0,107.73299,146.04306
58,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1879989.2,12.721337,8601.0,105.81750,58.83780
59,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,737349.6,5.719002,7089.0,102.92123,175.94562
60,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2865979.9,21.446763,17282.0,102.11314,61.44533


### Train and test the model

In [None]:
# Split off the target (the encoded song type). `pop` removes the column from
# `music_df` in place, leaving only the features; because of that, running
# this cell a second time without re-running the preprocessing raises a
# KeyError.
y = music_df.pop('remainder__Type')

In [None]:
# Fit a multiclass logistic regression and print its test metrics. The
# `multi_class` argument is deprecated in recent scikit-learn releases and is
# not needed: with the lbfgs solver a multinomial model is already what gets
# fitted for a target with more than two classes.
train_and_test_model(music_df, y, LogisticRegression(solver='lbfgs'))

Accuracy: 0.62
Confusion matrix:
[[4 0 3]
 [1 0 0]
 [1 0 4]]
