# 数据分析：美国电影

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## 导入电影数据

In [3]:
mdata=pd.read_table(r'C:\Users\telmu\Desktop\PythonJupyterNotebook\Datasets\movies.dat',sep='::',header=None,names=['movie_id', 'title', 'genres'])
mdata[:5]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 导入用户数据

In [4]:
udata=pd.read_table(r'C:\Users\telmu\Desktop\PythonJupyterNotebook\Datasets\users.dat',sep='::',header=None,names=['user_id', 'gender', 'age','occupation','zip'])
udata[:5]

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## 导入评级数据

In [5]:
rdata=pd.read_table(r'C:\Users\telmu\Desktop\PythonJupyterNotebook\Datasets\ratings.dat',sep='::',header=None,names=['user_id', 'movie_id', 'rating','timestamp'])
rdata[:5]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 合并所有数据

In [6]:
df=pd.merge(pd.merge(rdata,udata),mdata)
df.loc[df['user_id']==1][:10]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1725,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2250,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
2886,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4201,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy
5904,1,1197,3,978302268,F,1,10,48067,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
8222,1,1287,5,978302039,F,1,10,48067,Ben-Hur (1959),Action|Adventure|Drama
8926,1,2804,5,978300719,F,1,10,48067,"Christmas Story, A (1983)",Comedy|Drama
10278,1,594,4,978302268,F,1,10,48067,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
11041,1,919,4,978301368,F,1,10,48067,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical


In [7]:
df.iloc[0]

user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

## 按性别划分数据

In [8]:
mean_ratings=df.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
mean_ratings[:5]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


In [9]:
sized=df.groupby('title').size()
sized[:10]

title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

In [10]:
actives=sized.index[sized>=250]
actives[:10]

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)'],
      dtype='object', name='title')

In [11]:
mean_ratings=mean_ratings.loc[actives]
mean_ratings[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.5
101 Dalmatians (1996),3.24,2.911215
12 Angry Men (1957),4.184397,4.328421
"13th Warrior, The (1999)",3.112,3.168
2 Days in the Valley (1996),3.488889,3.244813
"20,000 Leagues Under the Sea (1954)",3.670103,3.709205
2001: A Space Odyssey (1968),3.825581,4.129738
2010 (1984),3.446809,3.413712


## 男生最喜欢的电影

In [12]:
top_male=mean_ratings.sort_values(by='M',ascending=False)[:10]
top_male

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",4.3147,4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628
"Shawshank Redemption, The (1994)",4.539075,4.560625
Raiders of the Lost Ark (1981),4.332168,4.520597
"Usual Suspects, The (1995)",4.513317,4.518248
Star Wars: Episode IV - A New Hope (1977),4.302937,4.495307
Schindler's List (1993),4.562602,4.491415
"Wrong Trousers, The (1993)",4.588235,4.478261
"Close Shave, A (1995)",4.644444,4.473795
Rear Window (1954),4.484536,4.472991


## 女生最喜欢的电影

In [12]:
top_female=mean_ratings.sort_values(by='F',ascending=False)[:10]
top_female

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611
Creature Comforts (1990),4.513889,4.272277
"Usual Suspects, The (1995)",4.513317,4.518248


## 评分差异最大的男生喜欢的，而女生不喜欢的电影

In [13]:
mean_ratings['diff']=mean_ratings['M']-mean_ratings['F']
sort=mean_ratings.sort_values(by='diff',ascending=False)
male=sort[sort['diff']>0.5]
male[:10]

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.676359
Dumb & Dumber (1994),2.697987,3.336595,0.638608
"Longest Day, The (1962)",3.411765,4.031447,0.619682
"Cable Guy, The (1996)",2.25,2.863787,0.613787
Evil Dead II (Dead By Dawn) (1987),3.297297,3.909283,0.611985
"Hidden, The (1987)",3.137931,3.745098,0.607167
Rocky III (1982),2.361702,2.943503,0.581801
Caddyshack (1980),3.396135,3.969737,0.573602
For a Few Dollars More (1965),3.409091,3.953795,0.544704


## 评分差异最大的女生喜欢的，而男生不喜欢的电影

In [14]:
female=sort[sort['diff']<-0.5]
female[:10]

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Rocky Horror Picture Show, The (1975)",3.673016,3.160131,-0.512885
Anastasia (1997),3.8,3.281609,-0.518391
Steel Magnolias (1989),3.901734,3.365957,-0.535777
Little Women (1994),3.870588,3.321739,-0.548849
Grease (1978),3.975265,3.367041,-0.608224
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Dirty Dancing (1987),3.790378,2.959596,-0.830782


## 观众意见分歧最大的电影

In [15]:
ratings_std=df.groupby('title')['rating'].std()
ratings_std=ratings_std.loc[actives]
ratings_std.sort_values(ascending=False)[:10]

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64

## 观众意见分歧最小的电影

In [16]:
ratings_std.sort_values()[:10]

title
Close Shave, A (1995)                            0.667143
Rear Window (1954)                               0.688946
Great Escape, The (1963)                         0.692585
Shawshank Redemption, The (1994)                 0.700443
Wrong Trousers, The (1993)                       0.708666
Raiders of the Lost Ark (1981)                   0.725647
North by Northwest (1959)                        0.732515
Hustler, The (1961)                              0.737298
Double Indemnity (1944)                          0.740793
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)    0.740924
Name: rating, dtype: float64

In [17]:
genres=mdata[mdata['title'].isin(male.index)]['genres']
genres

228                             Comedy
728                      Action|Sci-Fi
774                             Comedy
1183                    Action|Western
1241    Action|Adventure|Comedy|Horror
1928                            Horror
2340                      Action|Drama
2341                      Action|Drama
2798                     Comedy|Horror
2993                  Action|Drama|War
3352                            Comedy
3483                            Comedy
3507              Action|Horror|Sci-Fi
3612                           Western
3619                            Comedy
3691                            Comedy
Name: genres, dtype: object

## 男生最喜欢的电影类型

In [18]:
def genres(data):
    genres=mdata[mdata['title'].isin(data.index)]['genres']
    all_genres=[]
    for i in genres:
        all_genres.extend(i.split('|'))
    genres=pd.unique(all_genres)
    return genres
male1=genres(male)
male1

array(['Comedy', 'Action', 'Sci-Fi', 'Western', 'Adventure', 'Horror',
       'Drama', 'War'], dtype=object)

## 女生最喜欢的电影类型

In [19]:
female1=genres(female)
female1

array(['Drama', 'Musical', 'Romance', 'Comedy', 'Animation', "Children's",
       'Action', 'Thriller', 'Horror', 'Sci-Fi'], dtype=object)

## 男生最喜欢但女生不喜欢的电影类型

In [20]:
male2=[]
for i in male1:
    if i not in female1:
        male2.append(i)
male2

['Western', 'Adventure', 'War']

## 女生最喜欢但男生不喜欢的电影类型

In [21]:
female2=[]
for i in female1:
    if i not in male1:
        female2.append(i)
female2

['Musical', 'Romance', 'Animation', "Children's", 'Thriller']