In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix

Считываем данные

In [3]:
def reset_index(df):
    """Use the 'index_id' column as the (integer) index and drop that column.

    The input dataframe is left untouched; a new dataframe is returned.

    Parameters
    ----------
    df: {pandas-dataframe} dataframe with an 'index_id' column whose values
    can be converted to int

    Returns
    -------
    cur_df: {pandas-dataframe} new dataframe indexed by the integer values of
    'index_id' and without the 'index_id' column
    """
    # set_index builds a new frame, so the caller's dataframe keeps both its
    # original index and its 'index_id' column.
    cur_df = df.set_index('index_id')
    cur_df.index = cur_df.index.astype(int)
    return cur_df

In [4]:
def read_file(file_name, columns, encoding='utf-8', index=False):
    """Read a '::'-delimited file located at './data/<file_name>.dat'.

    Parameters
    ----------
    file_name: {string-like} base name of the file inside the ./data folder,
    given without the .dat extension

    columns: {python-list, numpy-array} name of columns which can be found in
    data description

    encoding: {string-like} should be used when it's different from utf-8

    index: {bool} when True the result is passed through reset_index, so
    `columns` has to contain 'index_id'

    Returns
    -------
    df: {pandas-dataframe} which was read from file; one row per non-blank
    line, every field is kept as a string
    """

    with open('./data/%s.dat' % file_name, 'r', encoding=encoding) as f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no record; skipping them avoids rows made of '' and None.
        read_data = [l.rstrip().split('::') for l in f if l.strip()]
    df = pd.DataFrame(read_data, columns=columns)
    if index:
        df = reset_index(df)
    return df

Преобразовываем категориальные признаки:

In [14]:
def get_categorial_features(df, columns):
    """Transform categorial features into one-hot encoding type.

    Parameters
    ----------
    df: {pandas-dataframe} dataframe which contains features that should be
        transformed

    columns: {python-list, numpy-array} names of the categorial columns that
    have to be represented in one-hot-encoding style. They must be a subset of
    df.columns

    Returns
    -------
    cur_df: {pandas-dataframe} dataframe with a default positional index whose
    first column holds the values of df.index, followed by one indicator
    column per (feature, category) pair named '<feature>_<category>'
    """

    cur_df = pd.DataFrame(df.index)

    for feature in columns:
        # Fresh encoders for every feature, so no fitted state is shared
        # between columns.
        label = LabelEncoder()
        onehot = OneHotEncoder()

        encoded = label.fit_transform(df[feature]).reshape(-1, 1)
        dense = onehot.fit_transform(encoded).toarray()
        # Format each class explicitly: `str + array` only works when the
        # classes are strings, while categories may also be numbers.
        names = ['%s_%s' % (feature, category) for category in label.classes_]
        cur_df = cur_df.join(pd.DataFrame(dense, columns=names))
    return cur_df

Преобразуем жанры к нормальному виду

In [24]:
def get_genres(df, genres):
    """Return the genres of every row as a 0/1 indicator dataframe.

    Parameters
    ----------
    df: {pandas-dataframe} dataframe with a 'genres' column whose cells hold
    genre names separated by '|'

    genres: {numpy-array, python-list} list with genres which will be the
    column names (in this order).

    Returns
    -------
    cur_df: {pandas-dataframe} dataframe with the index of `df` and one
    integer column per entry of `genres`; a cell is 1 when the row lists that
    genre and 0 otherwise. Genre names that are not in `genres` are ignored.
    """

    # One vectorised pass instead of per-cell .loc writes. The result is
    # aligned row by row, so duplicated index labels cannot mark the wrong rows.
    indicators = df['genres'].str.get_dummies(sep='|')
    # Keep exactly the requested columns, in the requested order; genres that
    # never occur become all-zero columns.
    cur_df = indicators.reindex(columns=list(genres), fill_value=0).astype(int)
    return cur_df

def label_genre(row):
    """Split the 'genres' entry of a row into separate genre names.

    Parameters
    ----------
    row: {pandas-series, dict-like} a single record that can be indexed by
    'genres'; the value is a string with genre names separated by '|'.

    Returns
    -------
    genres: {python-list} genre names in the order they appear in the cell.
    """
    genre_cell = row['genres']
    return genre_cell.split('|')

In [7]:
def to_adjacency_df(df):
    """From edge dataframe creates adjacency dataframe with shape {user * movies}.
    Cell (i, j) contains rating from user_i to movie_j.

    Parameters
    ----------
    df: {pandas-dataframe} dataframe which contains an edge list; the first
    three columns are read positionally as row id, column id and rating
    (the weight). Ids are expected to be integers starting from 1.

    Returns
    -------
    cur_df: {pandas-dataframe} dataframe represented as adjacency matrix,
    with index 1..max(row id) and columns 1..max(column id); pairs that do not
    occur in the edge list hold 0
    """

    # Work on the argument itself (not on a notebook-level global) and use
    # `.values`, which is available in both old and current pandas releases.
    edges = df.values
    n_rows, n_cols = (int(biggest) + 1 for biggest in edges[:, :2].max(axis=0))
    coo = coo_matrix((edges[:, 2], (edges[:, 0], edges[:, 1])),
                     shape=(n_rows, n_cols), dtype=edges.dtype)
    dense = coo.toarray()
    # Row 0 and column 0 exist only because ids start at 1; drop them.
    cur_df = pd.DataFrame(dense[1:, 1:], index=range(1, n_rows),
                          columns=range(1, n_cols))
    return cur_df

In [8]:
def average_based_on_user_movie(df, name='avg', axis=0):
    """Average the non-zero values of a dataframe along an axis.

    The sum along `axis` is divided by the number of non-zero cells along the
    same axis, so zero cells do not take part in the average.

    Parameters
    ----------
    df: {pandas-dataframe} numeric dataframe

    name: {string-like} name of column with average values in returning
    dataframe

    axis: {int-like} 0 or 1, the axis to sum along

    Returns
    -------
    cur_df: {pandas-dataframe} single-column dataframe with the averages,
    indexed like the result of df.sum(axis=axis); a row/column consisting
    only of zeros yields NaN
    """
    totals = df.sum(axis=axis)
    # `.values` replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas releases.
    non_zero = np.count_nonzero(df.values, axis=axis)
    avg = totals / non_zero
    cur_df = pd.DataFrame(avg, columns=[name])
    return cur_df

In [63]:
def join_dataframes(main_df, df, id_name):
    """Join dataframes by id_name. In main_df id_name is a column and in
    df it is an index.

    Parameters
    ----------
    main_df: {pandas-dataframe} dataframe is a base in which new features
    will be added; it is not modified

    df: {pandas-dataframe} dataframe with new features, indexed by the ids
    that appear in main_df[id_name]; its index has to be unique

    id_name: {string-like} name of the column of main_df linking the two
    dataframes

    Returns
    -------
    cur_df: {pandas-dataframe} copy of main_df with one extra column per
    column of df; rows whose id is absent from df.index get 0
    """
    cur_df = main_df.copy()
    # Look every id up once instead of building a boolean mask over main_df
    # for each (id, feature) pair; ids missing from df.index are filled with 0.
    looked_up = df.reindex(index=main_df[id_name].values, fill_value=0)
    for feature in df.columns:
        # Assign by position: looked_up is ordered exactly like main_df's rows.
        cur_df[feature] = looked_up[feature].values
    return cur_df

In [45]:
# Users and movies are indexed by their integer ids (index=True), while the
# ratings stay a plain edge list whose columns are all converted to int.
df_users = read_file(
    'users',
    ['index_id', 'gender', 'age', 'occupation', 'zip_code'],
    index=True,
)
df_movies = read_file(
    'movies',
    ['index_id', 'title', 'genres'],
    encoding='latin_1',
    index=True,
)
df_rates = read_file(
    'ratings',
    ['user_id', 'movie_id', 'rating', 'timestamp'],
).astype(int)

In [10]:
# Rating matrix: cell (user, movie) holds the rating, 0 when there is none.
df_users_movies_rating = to_adjacency_df(df_rates)

# Binary matrix built from a copy of the ratings: every non-zero cell becomes 1.
df_users_movies = df_users_movies_rating.copy()
df_users_movies[df_users_movies != 0] = 1

In [142]:
# Genre names; each one becomes an indicator column of one_hot_movie.
genres = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
one_hot_movie = get_genres(df=df_movies, genres=genres)

In [150]:
# Attach the genre indicators of the rated movie to every rating row.
joined = join_dataframes(main_df=df_rates, df=one_hot_movie, id_name='movie_id')