In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.linalg import svd
from math import *

# Сингулярное разложение


## Типовая матрица

Построю сингулярное разложение для матрицы A:

$$A=\left(
\begin{array}{rrr}
1 & -1 & -2 \\
-7/3 &  1/3 & 2/3\\
1/3 & -7/3 & -2/3 \\
-5/3 & 5/3 & -2/3
\end{array}
\right) \, $$

Составлю матрицы $U$, $\Sigma$, $V$ (в неусеченном виде)

При написании собственной реализации можно использовать *np.linalg.solve* и *np.linalg.inv*

In [None]:
A = np.array([[1, -1, -2],
              [-7/3, 1/3, 2/3],
              [1/3, -7/3, -2/3],
              [-5/3, 5/3, -2/3]])

print(A)

[[ 1.         -1.         -2.        ]
 [-2.33333333  0.33333333  0.66666667]
 [ 0.33333333 -2.33333333 -0.66666667]
 [-1.66666667  1.66666667 -0.66666667]]


In [None]:
### svd() ###

def do_svd(mat):
    U, Sigma, VT = svd(mat)
    U = pd.DataFrame(U)
    VT = pd.DataFrame(VT)
    Sigma = pd.DataFrame(Sigma)
    return U, Sigma, VT

In [None]:
U = do_svd(A)[0]

Sigmas = do_svd(A)[1].to_numpy().T

VT = do_svd(A)[2]

print('U:')
print(U,'\n\n')
print('Sigmas:')
print(Sigmas,'\n\n')
print('V^T:\n',VT,'\n')


Sigma = np.eye(4,3) * Sigmas

print(Sigma,'\n')

print('Check tre Result of (U * Sigma * VT) :')
print(U @ Sigma @ VT)

U:
     0         1         2    3
0 -0.5  0.670820 -0.223607 -0.5
1  0.5 -0.223607 -0.670820 -0.5
2 -0.5 -0.223607 -0.670820  0.5
3  0.5  0.670820 -0.223607  0.5 


Sigmas:
[[4. 2. 2.]] 


V^T:
           0         1         2
0 -0.666667  0.666667  0.333333
1 -0.000000  0.447214 -0.894427
2  0.745356  0.596285  0.298142 

[[4. 0. 0.]
 [0. 2. 0.]
 [0. 0. 2.]
 [0. 0. 0.]] 

Check tre Result of (U * Sigma * VT) :
          0         1         2
0  1.000000 -1.000000 -2.000000
1 -2.333333  0.333333  0.666667
2  0.333333 -2.333333 -0.666667
3 -1.666667  1.666667 -0.666667


## Работа с датасетом

Будем использовать [MovieLens 100k](http://grouplens.org/datasets/movielens/) (*MovieLens Latest Datasets (small)*, ratings.csv)
Мне нужно:
- скачать датасет
- предобработать данные: составить список уникальных пользователей и фильмов
- составить матрицу **users - movies** (функция *create_utility_matrix*); значения матрицы - Nan, если нет информации об оценке пользователем фильма, или число, если оценка есть
- составить сингулярное разложение (используя пакетную функцию или собственную реализацию)
- выбрать одного пользователя (один фильм) и найти максимально похожих на него
- вывести рекомендации для пользователя


In [None]:

data = pd.read_csv("ratings.csv")
print(data.head())
data['userId'] = data['userId'].astype('str')
data['movieId'] = data['movieId'].astype('str')
data['ratingId'] = data['rating'].astype('float')



# list of all users
users = data['userId'].unique()


# list of all movies
movies = data['movieId'].unique()

movies= movies.astype('int')
users = users.astype('int')
print()
print(users[:10])
print()
print(movies)
print(len(movies))

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

[ 1  2  3  4  5  6  7  8  9 10]

[     1      3      6 ... 160836 163937 163981]
9724


In [None]:

def create_utility_matrix(data, formatizer = {'user':0, 'item': 1, 'value': 2}):
    """
        :param data:      Array-like, 2D, nx3
        :param formatizer:pass the formatizer
        :return:          utility matrix (n x m), n=users, m=movies
    """

    itemField = formatizer['item']
    userField = formatizer['user']
    valueField = formatizer['value']

    userList = data.iloc[:,userField].tolist()  #iloc - вытаскивает все значений значения из столбца
    itemList = data.iloc[:,itemField].tolist()
    valueList = data.iloc[:,valueField].tolist()

    users = data.iloc[:,userField].unique().tolist()   # уникальные пользователи
    items = data.iloc[:,itemField].unique().tolist() # уникальные значения

    users_index =  {users[i]: i for i in range(len(users))}

    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}
    # data = data.drop(index=data.index[0], axis=0, inplace=True)
    for i in range(len(data)):
         item = itemList[i]
         user = userList[i]
         value = valueList[i]
         pd_dict[item][users_index[user]] = value
    #
    X = pd.DataFrame(pd_dict)
    X.index = users

    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    #
    # # users_index gives us a mapping of user_id to index of user
    # # items_index provides the same for items
    #
    X = X.replace(np.nan, 0)
    return X, users_index, items_index

user_item_array, users_index, items_index = create_utility_matrix(data)
user_item_matrix = np.matrix(user_item_array)


In [None]:
user_item_matrix[25:30,1:10]

matrix([[0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 3.5, 3. , 3.5, 0. , 0. , 3.5, 0. , 0. ],
        [0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 5. , 0. , 0. ]])

In [None]:
k = 10 # точность
U = do_svd(user_item_matrix)[0]
VT = do_svd(user_item_matrix)[2]
print(U)

          0         1         2         3         4         5         6    \
0   -0.055554  0.061674 -0.010897  0.000829 -0.092145  0.052507 -0.005511   
1   -0.005866 -0.017738 -0.004423 -0.013342  0.001494 -0.015452 -0.006556   
2   -0.001353  0.002069  0.001715  0.001961 -0.007389 -0.000206 -0.001489   
3   -0.032362  0.054230  0.034165 -0.036600  0.013853  0.007221 -0.039319   
4   -0.011409  0.023086 -0.048545  0.001969  0.024639 -0.032738 -0.006318   
..        ...       ...       ...       ...       ...       ...       ...   
605 -0.111282  0.072420  0.139288 -0.119315  0.236172 -0.010055  0.002122   
606 -0.038982  0.050598 -0.036474 -0.002681 -0.048906  0.023546 -0.019765   
607 -0.116114  0.011847 -0.009763  0.084029 -0.046761  0.101980  0.238187   
608 -0.007579  0.013785 -0.039741  0.001382  0.008063 -0.026422  0.003577   
609 -0.138865 -0.202184  0.092675  0.016582 -0.157839 -0.251973  0.063878   

          7         8         9    ...       600       601       602  \
0  

In [None]:
# по похожести фильмов

def recommend_movie_item(liked_movie, VT, output_num=10):
    global rec
    rec = []
    for item in range(len(VT.columns)):
        if item != liked_movie:
            rec.append([item,np.dot(VT[item],VT[liked_movie])]) # перемножаем векторы фильмов
    final_rec = [i[0] for i in sorted(rec, key=lambda x: x[1],reverse=True)] # сортируем в порядке убывания скалярных произведений
    return final_rec[:output_num]


# по похожести пользователей

def recommend_movie_user(target_user, U, output_num=10):
    global rec
    users_similarity = []
    for user in range(U.shape[0]):
        if user != target_user:
            users_similarity.append([user, np.dot(U.iloc[user], U.iloc[target_user])])
    sorted_users = [i[0] for i in sorted(users_similarity, key=lambda x: x[1],reverse=True)]
    all_movies = np.array(user_item_array.iloc[sorted_users[0]])
    rec_movies = all_movies.nonzero()
    return rec_movies[0][:output_num]


print(f'Индексы фильмов, похожие на 10 {recommend_movie_item(10, VT)}')
print(f'Индекса фильмов, рекомендованные для 10 пользователя {recommend_movie_user(10, U)}')

Индексы фильмов, похожие на 10 [754, 457, 30, 725, 93, 518, 11, 310, 1152, 41]
Индекса фильмов, рекомендованные для 10 пользователя [  0   7  16  20  26  69  84 184 192 203]


А вот еще мои наработки по собственной реализации SVD.

In [None]:
import random as R
import math as M

class Matrix:
    rows = 0
    cols = 0
    elements = []

    def setSize(self, r, c):

        self.rows = r
        self.cols = c

        self.elements = []

        for i in range(self.rows):
            self.elements.append([0] * self.cols)

    def printMatrix(self):
        for i in range(self.rows):
            print(self.elements[i])

    def inputMatrix(self):
        for i in range(self.rows):
            for j in range(self.cols):
                self.elements[i][j] = float(input())

    def setRandom(self):
        for i in range(self.rows):
            for j in range(self.cols):
                self.elements[i][j] = (R.random() - 0.5) * 2.

    def getNorm(self):

        n = 0.
        for i in range(self.rows):
            for j in range(self.cols):
                n += self.elements[i][j] * self.elements[i][j]

        return n

    def getTranspose(self):

        B = Matrix()
        B.setSize(self.cols, self.rows)

        for i in range(self.rows):
            for j in range(self.cols):
                B.elements[j][i] = self.elements[i][j]

        return B


def mult(A, B):
    C = Matrix()

    C.setSize(A.rows, B.cols)
    for i in range(C.rows):
        for j in range(C.cols):
            for k in range(A.cols):
                C.elements[i][j] += A.elements[i][k] * B.elements[k][j]

    return C


def sub(A, B):
    C = Matrix()
    C.setSize(A.rows, A.cols)

    for i in range(C.rows):
        for j in range(C.cols):
            C.elements[i][j] = A.elements[i][j] - B.elements[i][j]

    return C


def updateRow(A, a, b):
    aNorm = a.getNorm()
    for i in range(A.cols):
        b.elements[0][i] = 0
        for j in range(A.rows):
            b.elements[0][i] += A.elements[j][i] * a.elements[j][0] / aNorm


def updateCol(A, a, b):
    bNorm = b.getNorm()
    for i in range(A.rows):
        a.elements[i][0] = 0
        for j in range(A.cols):
            a.elements[i][0] += A.elements[i][j] * b.elements[0][j] / bNorm


def bigStep(A, U, V, iter_numb):
    n = A.getNorm()
    t = 0.00000001

    if n / (A.rows * A.cols) < t:
        U.setSize(A.rows, iter_numb)
        V.setSize(iter_numb, A.cols)

        return


    a = Matrix()
    a.setSize(A.rows, 1)

    b = Matrix()
    b.setSize(1, A.cols)

    a.setRandom()
    updateRow(A, a, b)
    updateCol(A, a, b)

    n0 = mult(a, b).getNorm()

    updateRow(A, a, b)
    updateCol(A, a, b)

    n1 = mult(a, b).getNorm()

    while abs(n1 - n0) > t:
        updateRow(A, a, b)
        updateCol(A, a, b)

        n0 = n1
        n1 = mult(a, b).getNorm()

    A = sub(A, mult(a, b))

    bigStep(A, U, V, iter_numb + 1)

    for i in range(A.rows):
        U.elements[i][iter_numb] = a.elements[i][0]

    for i in range(A.cols):
        V.elements[iter_numb][i] = b.elements[0][i]



try:
    print('Enter rows number')
    r = int(input())

    print('\nEnter cols number')
    c = int(input())

    if r > 0 and c > 0:
        A = Matrix()
        A.setSize(r, c)

        print('\nEnter matrix elements')
        A.inputMatrix()

        print('\nYour matrix is:')
        A.printMatrix()

        U = Matrix()
        V = Matrix()

        bigStep(A, U, V, 0)

        S = Matrix()
        S.setSize(U.cols, U.cols)

        for i in range(S.rows):

            n = 0
            for j in range(U.rows):
                n += U.elements[j][i] * U.elements[j][i]

            n = M.sqrt(n)
            for j in range(U.rows):
                U.elements[j][i] /= n

            S.elements[i][i] = n

            n = 0
            for j in range(V.cols):
                n += V.elements[i][j] * V.elements[i][j]

            n = M.sqrt(n)
            for j in range(V.cols):
                V.elements[i][j] /= n

            S.elements[i][i] *= n

        print("\nLeft matrix is:")
        U.printMatrix()

        print("\nRight matrix is:")
        V.printMatrix()

        print("\nSingular numbers are:")
        S.printMatrix()

        print("\nCheck matrix A:")
        mult(U, mult(S, V)).printMatrix()

        print("\nCheck matrix U:")
        mult(U.getTranspose(), U).printMatrix()

        print("\nCheck matrix V:")
        mult(V, V.getTranspose()).printMatrix()

    else:
        print('Incorrect matrix size')

except ValueError:
    print('Incorrect number format')


Enter rows number
2

Enter cols number
2

Enter matrix elements
1
2
3
4

Your matrix is:
[1.0, 2.0]
[3.0, 4.0]

Left matrix is:
[-0.40455358376513545, 0.9145144010930463]
[-0.9145142961500304, -0.40455334653593816]

Right matrix is:
[-0.576048451029764, -0.8174155504186409]
[-0.8174155504186416, 0.5760484510297631]

Singular numbers are:
[5.464985704219041, 0]
[0, 0.36596619062627017]

Check matrix A:
[0.9999999999999998, 1.9999999999999996]
[3.0, 3.999999999999999]

Check matrix U:
[1.0, -2.594045655501809e-07]
[-2.594045655501809e-07, 1.0]

Check matrix V:
[1.0, 1.1102230246251565e-15]
[1.1102230246251565e-15, 1.0]
