In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
import numpy as np
import matplotlib.pyplot as plt


In [2]:
sc = SparkContext("local")
user_data = sc.textFile("C:/input/spark/ml-100k/u.user")
user_data.first()

'1|24|M|technician|85711'

In [3]:
user_fields = user_data.map(lambda line: line.split("|"))
user_fields.take(5)


[['1', '24', 'M', 'technician', '85711'],
 ['2', '53', 'F', 'other', '94043'],
 ['3', '23', 'M', 'writer', '32067'],
 ['4', '24', 'M', 'technician', '43537'],
 ['5', '33', 'F', 'other', '15213']]

In [4]:
num_users = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields:fields[2]).distinct().count()

In [5]:
num_occupations = user_fields.map(lambda fields:fields[3]).distinct().count()

In [6]:
num_zipcodes = user_fields.map(lambda fields:fields[4]).distinct().count()

In [7]:
print("User: %d, genders: %d,occupations: %d, zip_codes: %d " %(num_users,num_genders,num_occupations,num_zipcodes))

User: 943, genders: 2,occupations: 21, zip_codes: 795 


In [8]:
ages = user_fields.map(lambda x:int(x[1])).collect()
plt.hist(ages,bins=20,color='lightblue',normed=True)
fig = plt.gcf()
fig.set_size_inches(16,10)
#plt.show()

The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  alternative="'density'", removal="3.1")


In [9]:
count_by_occupation = user_fields.map(lambda fields:(fields[3],1)).reduceByKey(lambda x,y:x+y).collect()
user_fields.take(5)

[['1', '24', 'M', 'technician', '85711'],
 ['2', '53', 'F', 'other', '94043'],
 ['3', '23', 'M', 'writer', '32067'],
 ['4', '24', 'M', 'technician', '43537'],
 ['5', '33', 'F', 'other', '15213']]

In [10]:
x_axis1 = np.array([c[0] for c in count_by_occupation])
y_axis1 = np.array([c[1] for c in count_by_occupation])

In [11]:
x_axis = x_axis1[np.argsort(y_axis1)]
y_axis = y_axis1[np.argsort(y_axis1)]

In [12]:
pos = np.arange(len(x_axis))
width = 1.0
ax = plt.axes()
ax.set_xticks(pos + (width /2))
ax.set_xticklabels(x_axis)

plt.bar(pos,y_axis,width,color='lightblue')
plt.xticks(rotation=30)
fig = plt.gcf()
fig.set_size_inches(16,10)
#plt.show()

  "Adding an axes using the same arguments as a previous axes "


In [13]:
count_by_occupation2 = user_fields.map(lambda fields:fields[3]).countByValue()
print(type(count_by_occupation2))
print("Map-reduce   approach:", dict(count_by_occupation))
print("")
print("countByValue approach:", dict(count_by_occupation2))

<class 'collections.defaultdict'>
Map-reduce   approach: {'marketing': 26, 'scientist': 31, 'doctor': 7, 'lawyer': 12, 'engineer': 67, 'retired': 14, 'student': 196, 'salesman': 12, 'none': 9, 'entertainment': 18, 'educator': 95, 'programmer': 66, 'executive': 32, 'artist': 28, 'administrator': 79, 'technician': 27, 'librarian': 51, 'other': 105, 'writer': 45, 'healthcare': 16, 'homemaker': 7}

countByValue approach: {'scientist': 31, 'retired': 14, 'student': 196, 'salesman': 12, 'entertainment': 18, 'programmer': 66, 'homemaker': 7, 'technician': 27, 'librarian': 51, 'executive': 32, 'doctor': 7, 'other': 105, 'engineer': 67, 'educator': 95, 'lawyer': 12, 'artist': 28, 'healthcare': 16, 'none': 9, 'writer': 45, 'administrator': 79, 'marketing': 26}


In [14]:
movie_data = sc.textFile("C:/input/spark/ml-100k/u.item")
movie_data.take(5)

['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0',
 '2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0',
 '5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0']

In [15]:
num_movies = movie_data.count()

In [16]:
print("movies: %d" % num_movies)

movies: 1682


In [17]:
def convert_year(x):
    try:
        return int(x[-4:])
    except:
        return 1900 # 若是据缺失年份则默认设定为1900，后续处理中过滤掉这类数据

In [34]:
movie_fields = movie_data.map(lambda lines: lines.split("|"))
#movie_fields.take(5)
print(type(movie_fields))

<class 'pyspark.rdd.PipelinedRDD'>


In [35]:
years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))
print(type(years))
years.take(5)

<class 'pyspark.rdd.PipelinedRDD'>


[1995, 1995, 1995, 1995, 1995]

In [37]:
years_filtered = years.filter(lambda x: x!=1900)
years_filtered.take(5)

[1995, 1995, 1995, 1995, 1995]

In [42]:
movie_ages = years_filtered.map(lambda yr: 1998-yr).countByValue()
print(type(movie_ages))
print(movie_ages)
values = movie_ages.values()
print(values)

<class 'collections.defaultdict'>
defaultdict(<class 'int'>, {0: 65, 1: 286, 2: 355, 3: 219, 4: 214, 5: 126, 6: 37, 7: 22, 8: 24, 9: 15, 10: 11, 11: 13, 12: 15, 13: 7, 14: 8, 15: 5, 16: 13, 17: 12, 18: 8, 19: 9, 20: 4, 21: 4, 22: 5, 23: 6, 24: 8, 25: 4, 26: 3, 27: 7, 28: 3, 29: 4, 30: 6, 31: 5, 32: 2, 33: 5, 34: 2, 35: 6, 36: 5, 37: 3, 38: 5, 39: 4, 40: 9, 41: 8, 42: 4, 43: 5, 44: 7, 45: 2, 46: 3, 47: 5, 48: 7, 49: 4, 50: 3, 51: 5, 52: 5, 53: 4, 54: 5, 55: 4, 56: 2, 57: 5, 58: 8, 59: 7, 60: 3, 61: 4, 62: 2, 63: 4, 64: 4, 65: 2, 66: 1, 67: 1, 68: 1, 72: 1, 76: 1})
dict_values([65, 286, 355, 219, 214, 126, 37, 22, 24, 15, 11, 13, 15, 7, 8, 5, 13, 12, 8, 9, 4, 4, 5, 6, 8, 4, 3, 7, 3, 4, 6, 5, 2, 5, 2, 6, 5, 3, 5, 4, 9, 8, 4, 5, 7, 2, 3, 5, 7, 4, 3, 5, 5, 4, 5, 4, 2, 5, 8, 7, 3, 4, 2, 4, 4, 2, 1, 1, 1, 1, 1])


In [55]:
bins = list(movie_ages.keys()) # dict_keys-->list
print(type(bins),bins)
plt.hist(values, bins=bins, color='lightblue', normed=True)
fig = plt.gcf()
fig.set_size_inches(16,10)

<class 'list'> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 72, 76]


The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  alternative="'density'", removal="3.1")


In [60]:
rating_data_raw = sc.textFile("C:/input/spark/ml-100k/u.data")
print(type(rating_data_raw))
print(rating_data_raw.first())

<class 'pyspark.rdd.RDD'>
196	242	3	881250949


In [61]:
num_ratings = rating_data_raw.count()
print("num_ratings: %d" %num_ratings)

num_ratings: 100000


In [72]:
rating_data = rating_data_raw.map(lambda line:line.split("\t"))
ratings = rating_data.map(lambda fields:int(fields[2]))
print(type(ratings),ratings.take(5))
max_rating = ratings.reduce(lambda x,y:max(x,y))
min_rating = ratings.reduce(lambda x,y:min(x,y))
mean_rating = ratings.reduce(lambda x,y:x+y) / num_ratings #均值
median_rating = np.median(ratings.collect()) # numpy求中位数
print(type(ratings.collect()),ratings.collect()[:5],median_rating,)
ratings_per_user = num_ratings / num_users
ratings_per_movie = num_ratings / num_movies

<class 'pyspark.rdd.PipelinedRDD'> [3, 3, 1, 2, 1]
<class 'list'> [3, 3, 1, 2, 1] 4.0


In [79]:
print ("Average rating: %2.2f" % mean_rating)

Average rating: 3.530


In [80]:
ratings.stats()

(count: 100000, mean: 3.529859999999947, stdev: 1.1256679707622548, max: 5.0, min: 1.0)

In [83]:
count_by_rating = ratings.countByValue()
x_axis = np.array(count_by_rating.keys())
y_axis = np.array(count_by_rating.value())

AttributeError: 'collections.defaultdict' object has no attribute 'value'