# Movie recommendation

Adding 'movies.dat' file to the project

In [2]:
import numpy as np
import pandas as pd
from pyhere import here

In [116]:
ratings_data_path = here('data', 'ch02', 'ml-1m/ratings.dat')
df_ratings = pd.read_csv(ratings_data_path, header=None, sep='::', engine='python')
df_ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

In [117]:
df_ratings.head()


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [118]:
movies_data_path = str(here('data', 'ch02', 'ml-1m/movies.dat'))
df_movies = pd.read_csv(movies_data_path, header=None, sep='::', engine='python', encoding='ISO-8859-1')


In [119]:
df_movies.columns = ['movie_id', 'title', 'genres']
df_movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [120]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [121]:
df_users = pd.read_csv(str(here('data', 'ch02', 'ml-1m/users.dat')), header=None, sep='::', engine='python')
df_users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
df_users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [122]:
df = pd.merge(df_ratings[['user_id', 'movie_id', 'rating']], df_movies[['movie_id', 'genres']], on='movie_id', how='left')
df.head()

Unnamed: 0,user_id,movie_id,rating,genres
0,1,1193,5,Drama
1,1,661,3,Animation|Children's|Musical
2,1,914,3,Musical|Romance
3,1,3408,4,Drama
4,1,2355,5,Animation|Children's|Comedy


In [123]:
df_main = pd.merge(df, df_users[['user_id', 'gender', 'age', 'occupation']], on='user_id', how='left')
df_main.head()

Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation
0,1,1193,5,Drama,F,1,10
1,1,661,3,Animation|Children's|Musical,F,1,10
2,1,914,3,Musical|Romance,F,1,10
3,1,3408,4,Drama,F,1,10
4,1,2355,5,Animation|Children's|Comedy,F,1,10


In [124]:
df_main['genres'].value_counts()

genres
Comedy                       116883
Drama                        111423
Comedy|Romance                42712
Comedy|Drama                  42245
Drama|Romance                 29170
                              ...  
Drama|Romance|Western            29
Children's|Fantasy               27
Comedy|Film-Noir|Thriller         5
Film-Noir|Horror                  2
Fantasy                           1
Name: count, Length: 301, dtype: int64

In [125]:
pipe_mask = df_main['genres'].str.contains(r'\|') 
df_main.loc[pipe_mask, 'genres'].value_counts()


genres
Comedy|Romance               42712
Comedy|Drama                 42245
Drama|Romance                29170
Action|Thriller              26759
Drama|Thriller               18248
                             ...  
Adventure|Musical|Romance       30
Drama|Romance|Western           29
Children's|Fantasy              27
Comedy|Film-Noir|Thriller        5
Film-Noir|Horror                 2
Name: count, Length: 283, dtype: int64

In the genres column, we have a pipe (|) character that separates the genres. We can use the str.split() method to split the genres into a list of genres. We can then use the explode() method to create a new row for each genre. This will allow us to analyze the genres in a more granular man.

In [126]:
df_main

Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation
0,1,1193,5,Drama,F,1,10
1,1,661,3,Animation|Children's|Musical,F,1,10
2,1,914,3,Musical|Romance,F,1,10
3,1,3408,4,Drama,F,1,10
4,1,2355,5,Animation|Children's|Comedy,F,1,10
...,...,...,...,...,...,...,...
1000204,6040,1091,1,Comedy,M,25,6
1000205,6040,1094,5,Drama|Romance|War,M,25,6
1000206,6040,562,5,Comedy|Drama,M,25,6
1000207,6040,1096,4,Drama,M,25,6


In [127]:
# Create a genres mapping dictionary
# get a list of single genres
genres_list = df_main['genres'].to_list()
genres_list[0:10]

['Drama',
 "Animation|Children's|Musical",
 'Musical|Romance',
 'Drama',
 "Animation|Children's|Comedy",
 'Action|Adventure|Comedy|Romance',
 'Action|Adventure|Drama',
 'Comedy|Drama',
 "Animation|Children's|Musical",
 "Adventure|Children's|Drama|Musical"]

In [128]:
# create an empty set to keep track of unique genres
unique_genres = set()
# loop through the genres list and split each genre string into a list of genres
for g in genres_list:
    if "|" in g:
        genre = g.split("|")
        for genre in g.split("|"):
            unique_genres.add(genre)
    else:
        unique_genres.add(g)

unique_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [129]:
# Create a genere mapping dictionary
genre_mapping = {genre: i for i, genre in enumerate(unique_genres)}
genre_mapping

{'Adventure': 0,
 'Film-Noir': 1,
 'Action': 2,
 'Fantasy': 3,
 'Western': 4,
 'Musical': 5,
 'Documentary': 6,
 'Animation': 7,
 'Romance': 8,
 'Crime': 9,
 'Sci-Fi': 10,
 'Horror': 11,
 'Mystery': 12,
 'Drama': 13,
 'Comedy': 14,
 'Thriller': 15,
 "Children's": 16,
 'War': 17}

In [130]:
# create a new dataframe that has the genres as columns
df_genres = df_main.copy()
df_genres.head()

Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation
0,1,1193,5,Drama,F,1,10
1,1,661,3,Animation|Children's|Musical,F,1,10
2,1,914,3,Musical|Romance,F,1,10
3,1,3408,4,Drama,F,1,10
4,1,2355,5,Animation|Children's|Comedy,F,1,10


In [131]:
new_columns = [key.lower() for key in genre_mapping.keys()]
# new_columns
initial_values =np.zeros(len(df_genres), dtype=int)
for col in new_columns:
    df_genres[col] = initial_values
df_genres.head()


Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation,adventure,film-noir,action,...,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,1193,5,Drama,F,1,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,661,3,Animation|Children's|Musical,F,1,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,914,3,Musical|Romance,F,1,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3408,4,Drama,F,1,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,Animation|Children's|Comedy,F,1,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
# Initialise the new genres column to 1 if it is present in the original dataframe
# df_genres['drama'] = np.where(df_genres['genres'].str.lower().str.contains('drama'), 1, 0)
# df_genres['adventure'] = np.where(df_main['genre'].str.lower().str.contains('adventure'), 1, 0)
# df_genres['animation'] = np.where(df_main['genre'].str.lower().strcontains('animation'), 1, 0)
# df_genres['comedy'] = np.where(df_main['genre'].str.lower().str.contains('comedy'), 1, 0)
# df_genres['crime'] = np.where(df_main['genre'].str.lower().strcontains('crime'), 1, 0)
# df_genres['action'] = np.where(df_main['genre'].str.lower().str.contains('action'), 1, 0)
# df_genres['fantasy'] = np.where(df_main['genre'].str.lower().strcontains('fantasy'), 1, 0)
# df_genres['film-noir'] = np.where(df_main['genre'].str.lower().str.contains('film-noir'), 1, 0)
# df_genres['horror'] = np.where(df_main['genre'].str.lower().str.contains('horror'), 1, 0)
# df_genres['mystery'] = np.where(df_main['genre'].str.lower().str.contains('mystery'), 1, 0)
# df_genres['romance'] = np.where(df_main['genre'].str.lower().str.contains('romance'), 1, 0)
# df_genres['science-fiction'] = np.where(df_main['genre'].str.lower().str.contains('science-fiction'), 1, 0)
# df_genres['thriller'] = np.where(df_main['genre'].str.lower().str.contains('thriller'), 1, 0)
# should functionalise the above

In [133]:
for col in new_columns:
    df_genres[col] = np.where(df_genres['genres'].str.lower().str.contains(col), 1, 0)

In [134]:
df_genres.head()

Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation,adventure,film-noir,action,...,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,1193,5,Drama,F,1,10,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,661,3,Animation|Children's|Musical,F,1,10,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,914,3,Musical|Romance,F,1,10,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,3408,4,Drama,F,1,10,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,2355,5,Animation|Children's|Comedy,F,1,10,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [135]:
# update change gender coding F = 0 M = 1
df_genres['gender'] = np.where(df_genres['gender'].str.lower() == 'f', 0, 1)
df_genres.head()


Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation,adventure,film-noir,action,...,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,1193,5,Drama,0,1,10,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,661,3,Animation|Children's|Musical,0,1,10,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,914,3,Musical|Romance,0,1,10,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,3408,4,Drama,0,1,10,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,2355,5,Animation|Children's|Comedy,0,1,10,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [136]:
df_genres['rating'].value_counts()

rating
4    348971
3    261197
5    226310
2    107557
1     56174
Name: count, dtype: int64

In [137]:
df_genres['rating'].describe()

count    1.000209e+06
mean     3.581564e+00
std      1.117102e+00
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [138]:
df_genres

Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation,adventure,film-noir,action,...,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,1193,5,Drama,0,1,10,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,661,3,Animation|Children's|Musical,0,1,10,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,914,3,Musical|Romance,0,1,10,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,3408,4,Drama,0,1,10,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,2355,5,Animation|Children's|Comedy,0,1,10,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,Comedy,1,25,6,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1000205,6040,1094,5,Drama|Romance|War,1,25,6,0,0,0,...,1,0,0,0,0,1,0,0,0,1
1000206,6040,562,5,Comedy|Drama,1,25,6,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1000207,6040,1096,4,Drama,1,25,6,0,0,0,...,0,0,0,0,0,1,0,0,0,0


At this stage, the plan would be to summarise the generes columns for each user such that each row in the data frame represents a single user.

How should the genres be summarise?
1.Sum up each column
2. Calculate a proportion of each genre for each user

In [139]:
unique_genres = [g.lower() for g in unique_genres]
unique_genres

['adventure',
 'film-noir',
 'action',
 'fantasy',
 'western',
 'musical',
 'documentary',
 'animation',
 'romance',
 'crime',
 'sci-fi',
 'horror',
 'mystery',
 'drama',
 'comedy',
 'thriller',
 "children's",
 'war']

In [140]:
 df_genres_agg = df_genres[['user_id'] + unique_genres].groupby('user_id').sum().reset_index()
# genres_agg_df.columns = ['user_id'] + ['count_' + col for col in]

In [141]:
df_genres_agg.head()

Unnamed: 0,user_id,adventure,film-noir,action,fantasy,western,musical,documentary,animation,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,5,0,5,3,0,14,0,18,6,2,3,0,0,21,14,3,20,2
1,2,19,1,56,1,3,0,0,0,24,12,17,2,3,79,25,31,0,15
2,3,25,0,23,2,6,1,0,3,5,0,6,3,1,8,30,5,3,2
3,4,6,0,19,2,2,0,0,0,2,1,9,3,0,6,0,4,1,3
4,5,9,3,31,0,1,3,6,4,30,21,15,10,8,104,56,39,6,6


In [153]:
# normalise the counts
# total_movies_per_user = df_genres.groupby('user_id')['user_id'].value_counts()
# total_movies_per_user
# I think the above is wrong. It should be the sum of all genres rating per user
# so need to row sum and divide df_genres_agg by this value
row_sum_normalizer = np.sum(df_genres_agg[unique_genres], axis=1)
row_sum_normalizer


0        116
1        288
2        123
3         58
4        352
        ... 
6035    1691
6036     408
6037      41
6038     277
6039     603
Length: 6040, dtype: int64

In [156]:
type(row_sum_normalizer)

pandas.core.series.Series

In [146]:
df_genres_to_normalize = df_genres_agg.loc[:, unique_genres]
df_genres_to_normalize.head()

Unnamed: 0,adventure,film-noir,action,fantasy,western,musical,documentary,animation,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,5,0,5,3,0,14,0,18,6,2,3,0,0,21,14,3,20,2
1,19,1,56,1,3,0,0,0,24,12,17,2,3,79,25,31,0,15
2,25,0,23,2,6,1,0,3,5,0,6,3,1,8,30,5,3,2
3,6,0,19,2,2,0,0,0,2,1,9,3,0,6,0,4,1,3
4,9,3,31,0,1,3,6,4,30,21,15,10,8,104,56,39,6,6


In [155]:
df_genres_normalized = df_genres_to_normalize.div(row_sum_normalizer, axis=0)
df_genres_normalized.head()

Unnamed: 0,adventure,film-noir,action,fantasy,western,musical,documentary,animation,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,0.043103,0.0,0.043103,0.025862,0.0,0.12069,0.0,0.155172,0.051724,0.017241,0.025862,0.0,0.0,0.181034,0.12069,0.025862,0.172414,0.017241
1,0.065972,0.003472,0.194444,0.003472,0.010417,0.0,0.0,0.0,0.083333,0.041667,0.059028,0.006944,0.010417,0.274306,0.086806,0.107639,0.0,0.052083
2,0.203252,0.0,0.186992,0.01626,0.04878,0.00813,0.0,0.02439,0.04065,0.0,0.04878,0.02439,0.00813,0.065041,0.243902,0.04065,0.02439,0.01626
3,0.103448,0.0,0.327586,0.034483,0.034483,0.0,0.0,0.0,0.034483,0.017241,0.155172,0.051724,0.0,0.103448,0.0,0.068966,0.017241,0.051724
4,0.025568,0.008523,0.088068,0.0,0.002841,0.008523,0.017045,0.011364,0.085227,0.059659,0.042614,0.028409,0.022727,0.295455,0.159091,0.110795,0.017045,0.017045


In [103]:
genres_agg_df.loc[0, unique_genres].sum()

np.int64(116)

In [160]:
# Join user_id back to normalised datafram
df_genres_agg

Unnamed: 0,user_id,adventure,film-noir,action,fantasy,western,musical,documentary,animation,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,5,0,5,3,0,14,0,18,6,2,3,0,0,21,14,3,20,2
1,2,19,1,56,1,3,0,0,0,24,12,17,2,3,79,25,31,0,15
2,3,25,0,23,2,6,1,0,3,5,0,6,3,1,8,30,5,3,2
3,4,6,0,19,2,2,0,0,0,2,1,9,3,0,6,0,4,1,3
4,5,9,3,31,0,1,3,6,4,30,21,15,10,8,104,56,39,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,83,17,154,19,14,31,11,34,122,53,169,74,34,372,261,147,54,42
6036,6037,9,9,28,4,4,4,1,1,22,18,39,9,13,98,59,68,6,16
6037,6038,1,0,2,0,0,0,0,3,6,0,1,2,0,9,12,0,1,4
6038,6039,10,6,8,5,2,42,0,13,30,2,8,1,17,28,65,14,17,9


In [161]:
df_genres_agg_normalized = pd.merge(df_genres_agg['user_id' ], df_genres_normalized, left_index=True, right_index=True)
df_genres_agg_normalized.head()

Unnamed: 0,user_id,adventure,film-noir,action,fantasy,western,musical,documentary,animation,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,0.043103,0.0,0.043103,0.025862,0.0,0.12069,0.0,0.155172,0.051724,0.017241,0.025862,0.0,0.0,0.181034,0.12069,0.025862,0.172414,0.017241
1,2,0.065972,0.003472,0.194444,0.003472,0.010417,0.0,0.0,0.0,0.083333,0.041667,0.059028,0.006944,0.010417,0.274306,0.086806,0.107639,0.0,0.052083
2,3,0.203252,0.0,0.186992,0.01626,0.04878,0.00813,0.0,0.02439,0.04065,0.0,0.04878,0.02439,0.00813,0.065041,0.243902,0.04065,0.02439,0.01626
3,4,0.103448,0.0,0.327586,0.034483,0.034483,0.0,0.0,0.0,0.034483,0.017241,0.155172,0.051724,0.0,0.103448,0.0,0.068966,0.017241,0.051724
4,5,0.025568,0.008523,0.088068,0.0,0.002841,0.008523,0.017045,0.011364,0.085227,0.059659,0.042614,0.028409,0.022727,0.295455,0.159091,0.110795,0.017045,0.017045


Unnamed: 0,user_id,movie_id,rating,genres
0,1,1193,5,Drama
1,1,661,3,Animation|Children's|Musical
2,1,914,3,Musical|Romance
3,1,3408,4,Drama
4,1,2355,5,Animation|Children's|Comedy
...,...,...,...,...
1000204,6040,1091,1,Comedy
1000205,6040,1094,5,Drama|Romance|War
1000206,6040,562,5,Comedy|Drama
1000207,6040,1096,4,Drama


In [163]:
df_main

Unnamed: 0,user_id,movie_id,rating,genres,gender,age,occupation
0,1,1193,5,Drama,F,1,10
1,1,661,3,Animation|Children's|Musical,F,1,10
2,1,914,3,Musical|Romance,F,1,10
3,1,3408,4,Drama,F,1,10
4,1,2355,5,Animation|Children's|Comedy,F,1,10
...,...,...,...,...,...,...,...
1000204,6040,1091,1,Comedy,M,25,6
1000205,6040,1094,5,Drama|Romance|War,M,25,6
1000206,6040,562,5,Comedy|Drama,M,25,6
1000207,6040,1096,4,Drama,M,25,6


In [169]:
df_collapsed_per_user = df_main.groupby('user_id')[['gender', 'age', 'occupation']].first()
df_collapsed_per_user

Unnamed: 0_level_0,gender,age,occupation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,F,1,10
2,M,56,16
3,M,25,15
4,M,45,7
5,M,25,20
...,...,...,...
6036,F,25,15
6037,F,45,1
6038,F,56,1
6039,F,45,0


In [172]:
df_per_user_normalized = pd.merge(df_collapsed_per_user, df_genres_agg_normalized, on='user_id')
df_per_user_normalized.head()

Unnamed: 0,user_id,gender,age,occupation,adventure,film-noir,action,fantasy,western,musical,...,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,1,F,1,10,0.043103,0.0,0.043103,0.025862,0.0,0.12069,...,0.051724,0.017241,0.025862,0.0,0.0,0.181034,0.12069,0.025862,0.172414,0.017241
1,2,M,56,16,0.065972,0.003472,0.194444,0.003472,0.010417,0.0,...,0.083333,0.041667,0.059028,0.006944,0.010417,0.274306,0.086806,0.107639,0.0,0.052083
2,3,M,25,15,0.203252,0.0,0.186992,0.01626,0.04878,0.00813,...,0.04065,0.0,0.04878,0.02439,0.00813,0.065041,0.243902,0.04065,0.02439,0.01626
3,4,M,45,7,0.103448,0.0,0.327586,0.034483,0.034483,0.0,...,0.034483,0.017241,0.155172,0.051724,0.0,0.103448,0.0,0.068966,0.017241,0.051724
4,5,M,25,20,0.025568,0.008523,0.088068,0.0,0.002841,0.008523,...,0.085227,0.059659,0.042614,0.028409,0.022727,0.295455,0.159091,0.110795,0.017045,0.017045


In [173]:
X1 = df_per_user_normalized.drop(columns=['user_id'])
# y1 = df_per_user_normalized['gender']
X1

Unnamed: 0,gender,age,occupation,adventure,film-noir,action,fantasy,western,musical,documentary,...,romance,crime,sci-fi,horror,mystery,drama,comedy,thriller,children's,war
0,F,1,10,0.043103,0.000000,0.043103,0.025862,0.000000,0.120690,0.000000,...,0.051724,0.017241,0.025862,0.000000,0.000000,0.181034,0.120690,0.025862,0.172414,0.017241
1,M,56,16,0.065972,0.003472,0.194444,0.003472,0.010417,0.000000,0.000000,...,0.083333,0.041667,0.059028,0.006944,0.010417,0.274306,0.086806,0.107639,0.000000,0.052083
2,M,25,15,0.203252,0.000000,0.186992,0.016260,0.048780,0.008130,0.000000,...,0.040650,0.000000,0.048780,0.024390,0.008130,0.065041,0.243902,0.040650,0.024390,0.016260
3,M,45,7,0.103448,0.000000,0.327586,0.034483,0.034483,0.000000,0.000000,...,0.034483,0.017241,0.155172,0.051724,0.000000,0.103448,0.000000,0.068966,0.017241,0.051724
4,M,25,20,0.025568,0.008523,0.088068,0.000000,0.002841,0.008523,0.017045,...,0.085227,0.059659,0.042614,0.028409,0.022727,0.295455,0.159091,0.110795,0.017045,0.017045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,F,25,15,0.049083,0.010053,0.091070,0.011236,0.008279,0.018332,0.006505,...,0.072147,0.031342,0.099941,0.043761,0.020106,0.219988,0.154347,0.086931,0.031934,0.024837
6036,F,45,1,0.022059,0.022059,0.068627,0.009804,0.009804,0.009804,0.002451,...,0.053922,0.044118,0.095588,0.022059,0.031863,0.240196,0.144608,0.166667,0.014706,0.039216
6037,F,56,1,0.024390,0.000000,0.048780,0.000000,0.000000,0.000000,0.000000,...,0.146341,0.000000,0.024390,0.048780,0.000000,0.219512,0.292683,0.000000,0.024390,0.097561
6038,F,45,0,0.036101,0.021661,0.028881,0.018051,0.007220,0.151625,0.000000,...,0.108303,0.007220,0.028881,0.003610,0.061372,0.101083,0.234657,0.050542,0.061372,0.032491


In [176]:
y1 = (X1.iloc[:, 3:] > 0.1).astype(int).sum(axis=1)
y1

0       5
1       3
2       3
3       4
4       3
       ..
6035    2
6036    3
6037    3
6038    4
6039    2
Length: 6040, dtype: int64