## Retrieving and cleaning data


In [1]:
!pip install sqlalchemy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os

import pandas as pd
from sqlalchemy import create_engine, inspect

In [3]:
# Create a connection to the MySQL database using SQLAlchemy.
# The connection URL can be overridden through the GROUP12_DB_URL environment
# variable so that real credentials never have to be written into the notebook;
# when the variable is not set, the local development URL is used.
db_url = os.environ.get('GROUP12_DB_URL', 'mysql+pymysql://root:@localhost/group12')
engine = create_engine(db_url)

# Use the inspect module to get table names
inspector = inspect(engine)
tables = inspector.get_table_names()

# Print the list of all tables
print(tables)

['calibration', 'pre_propositions', 'propositions', 'recommendation', 'userconsent', 'userdemograph', 'userpresatisfaction', 'users', 'usersatisfaction', 'userseconddemograph', 'userselection']


In [22]:
# Query to select all items from the pre_propositions table
query = "SELECT * FROM pre_propositions"

# Load data into a pandas DataFrame
pre_propositions = pd.read_sql(query, engine)

# Turn each id into the string 'pre_<id>'. The concatenation is vectorised,
# so no per-row lambda (and no helper globals) is needed.
pre_propositions['id'] = 'pre_' + pre_propositions['id'].astype(str)

# Print the DataFrame
print(pre_propositions)

        id                                           question  scale
0    pre_1      I am considerate and kind to almost everyone.      7
1    pre_2                   I like to cooperate with others.      7
2    pre_3            I am helpful and unselfish with others.      7
3    pre_4                         I have a forgiving nature.      7
4    pre_5                           I am generally trusting.      7
5   pre_11      When I have made a decision, I feel relieved.      7
6   pre_12  When I am confronted with a problem, I’m dying...      7
7   pre_13  I would quickly become impatient and irritated...      7
8   pre_14  I would rather make a decision quickly than sl...      7
9   pre_15  Even if I get a lot of time to make a decision...      7
10  pre_16  I almost always feel hurried to reach a decisi...      7


In [23]:
# Query to select all items from the propositions table
query = "SELECT * FROM propositions"

# Load data into a pandas DataFrame
propositions = pd.read_sql(query, engine)

# Turn each id into the string 'post_<id>'. The concatenation is vectorised,
# so no per-row lambda (and no helper globals) is needed.
propositions['id'] = 'post_' + propositions['id'].astype(str)

# Print the DataFrame
print(propositions)

         id                                           question  scale
0    post_1   I think the book recommendations are credible...      7
1    post_2                  I trust the book recommendations.      7
2    post_3   I believe the book recommendations are trustw...      7
3   post_11  I intend to read the book I chose from the rec...      7
4   post_12  I will follow up and look for information abou...      7
5   post_21        I am familiar with the author of this book.      7
6   post_22  I am familiar with the genre this book belongs...      7
7   post_23      I have read books similar to the one I chose.      7
8   post_24  This is an attention check. Please select 'Str...      7
9   post_31              I am satisfied with the book I chose.      7
10  post_32       I would recommend the chosen book to others.      7
11  post_33     I think I would enjoy reading the chosen book.      7
12  post_34  I would rather pick a different book from the ...      7
13  post_35    I thi

In [4]:
# Read every row of the userconsent table into a DataFrame and print it.
query = "SELECT * FROM userconsent"
userconsent = pd.read_sql(sql=query, con=engine)
print(userconsent)

   userId  consent
0     536        1
1     545        1


In [5]:
# Read every row of the userdemograph table into a DataFrame and print it.
query = "SELECT * FROM userdemograph"
userdemograph = pd.read_sql(sql=query, con=engine)
print(userdemograph)

   userId  leeftijd  gender  opleiding  reading  genre
0     550        12       2          3        2      2


In [6]:
# Function to add prefix to all columns except the key column ('userId' by default)
def add_prefix_except_userid(df, prefix, key_column="userId"):
    """Return a copy of ``df`` whose column labels are prefixed with ``prefix``.

    The column named ``key_column`` keeps its label unchanged so it can still
    be used as a join key. Every other label is rewritten as
    ``f"{prefix}{label}"``, so non-string labels (e.g. integer question ids
    coming out of a pivot) become strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed; it is not modified.
    prefix : str
        Text placed in front of every renamed column label.
    key_column : str, default "userId"
        Label of the column that must not be renamed.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    return df.rename(columns=lambda col: col if col == key_column else f"{prefix}{col}")

In [7]:
# Read every row of the userpresatisfaction table into a DataFrame and print it.
query = "SELECT * FROM userpresatisfaction"
userpresatisfaction = pd.read_sql(sql=query, con=engine)
print(userpresatisfaction)

     userId  questionId  value
0       534           3      4
1       534           2      4
2       534           1      4
3       534           5      4
4       534          12      2
..      ...         ...    ...
138     546          13      2
139     546          15      2
140     546          12      2
141     546          14      2
142     546          16      2

[143 rows x 3 columns]


In [8]:
# Reshape from long to wide: one row per userId, one column per questionId,
# cells holding the answer value.
pre_answers_by_user = userpresatisfaction.pivot(index='userId', columns='questionId', values='value')
# Move userId back into a regular column and drop the leftover columns-axis name.
userpresatisfaction_wide = pre_answers_by_user.reset_index()
userpresatisfaction_wide.columns.name = None

In [9]:
# Prefix every question column with 'pre_' while leaving 'userId' untouched.
# NOTE: this reassigns the same name, so re-running the cell without first
# re-running the pivot cell prefixes the columns a second time.
userpresatisfaction_wide = add_prefix_except_userid(df=userpresatisfaction_wide, prefix='pre_')


In [10]:
# Query to select all items from users table
#query = "SELECT * FROM users"

# Load data into a pandas DataFrame
#users = pd.read_sql(query, engine)

# Print the DataFrame
#print(users)

In [11]:
# Read every row of the usersatisfaction table into a DataFrame and print it.
query = "SELECT * FROM usersatisfaction"
usersatisfaction = pd.read_sql(sql=query, con=engine)
print(usersatisfaction)

   userId  questionId  value                time
0     545           5      2 2024-06-18 14:03:57
1     545           4      2 2024-06-18 14:03:57
2     545          14      2 2024-06-18 14:03:57
3     545          15      2 2024-06-18 14:03:57
4     545          16      2 2024-06-18 14:03:57
5     545          13      2 2024-06-18 14:03:57


In [12]:
# Reshape from long to wide: one row per userId, one column per questionId,
# cells holding the answer value (the other columns are not carried over).
post_answers_by_user = usersatisfaction.pivot(index='userId', columns='questionId', values='value')
# Move userId back into a regular column and drop the leftover columns-axis name.
usersatisfaction_wide = post_answers_by_user.reset_index()
usersatisfaction_wide.columns.name = None

In [13]:
# Prefix every question column with 'post_' while leaving 'userId' untouched.
# NOTE: this reassigns the same name, so re-running the cell without first
# re-running the pivot cell prefixes the columns a second time.
usersatisfaction_wide = add_prefix_except_userid(df=usersatisfaction_wide, prefix='post_')

In [14]:
# Read every row of the userseconddemograph table into a DataFrame and print it.
query = "SELECT * FROM userseconddemograph"
userseconddemograph = pd.read_sql(sql=query, con=engine)
print(userseconddemograph)

   userId  mancheck                time
0     536         1 2024-06-18 13:40:52
1     543         1 2024-06-18 14:00:05
2     544         0 2024-06-18 14:02:45
3     545         1 2024-06-18 14:03:49


In [15]:
# Read every row of the userselection table into a DataFrame and print it.
query = "SELECT * FROM userselection"
userselection = pd.read_sql(sql=query, con=engine)
print(userselection)


   userId  conditie  calibrationId  recommendationId
0     536         0            711              1087
1     542         1           1001              1051
2     543         0           1380               210
3     544         1            709               727
4     545         0            711              1628


In [16]:
# Per-user tables to combine; the first entry seeds the merge.
dfs = [userconsent, userdemograph, userpresatisfaction_wide, usersatisfaction_wide, userseconddemograph, userselection]

# Outer-join each remaining table onto the running result by userId, so a
# user that appears in any of the tables is kept in the merged frame.
merged_df, *remaining = dfs
for df in remaining:
    merged_df = pd.merge(left=merged_df, right=df, on='userId', how="outer")


In [25]:
# Show the column labels of the merged DataFrame
merged_df.columns

Index(['userId', 'consent', 'leeftijd', 'gender', 'opleiding', 'reading',
       'genre', 'pre_1', 'pre_2', 'pre_3', 'pre_4', 'pre_5', 'pre_11',
       'pre_12', 'pre_13', 'pre_14', 'pre_15', 'pre_16', 'post_4', 'post_5',
       'post_13', 'post_14', 'post_15', 'post_16', 'mancheck', 'time',
       'conditie', 'calibrationId', 'recommendationId'],
      dtype='object')

In [26]:
# Write the merged table to final_df.csv, leaving out the row index
merged_df.to_csv(path_or_buf='final_df.csv', index=False)