In [23]:
import os
import tensorflow as tf

In [4]:
# 这里的代码针对的是英伟达（vnidia）的 GPU
def check_gpus():
    """检查是否存在 GPU"""
    if not "NVIDIA System Management" in os.popen("nvidia-smi -h").read():
        print("cuda 工具没有安装")
        return False
    gpus_index = os.popen("nvidia-smi --query-gpu=index --format=csv,noheader").readlines()
    if len(gpus_index) < 1:
        print("没有 GPU 存在")
        return False
    print("存在 GPU，总共有 %d 块 GPU 卡" % (len(gpus_index)))
    return True

In [5]:
check_gpus()

存在 GPU，总共有 4 块 GPU 卡


True

In [6]:
def parse(line, qargs):
    """
    line：一行文本
    qargs：查询参数
    解析一行 nvidia-smi 返回 csv 格式文本
    """
    numberic_args = ['memory.free', 'memory.total', 'power.draw', 'power.limit'] #可计数的参数
    power_manage_enable=lambda v:(not 'Not Support' in v) #lambda表达式，显卡是否滋瓷power management（笔记本可能不滋瓷）
    to_numberic = lambda v:float(v.upper().strip().replace('MIB','').replace('W','')) #带单位字符串去掉单位
    process = lambda k,v:((int(to_numberic(v)) if power_manage_enable(v) else 1) if k in numberic_args else v.strip())
    return {k:process(k,v) for k,v in zip(qargs,line.strip().split(','))}

In [7]:
def query_gpu(qargs=[]):
    """
    qargs: 查询参数
    return: a list of dict
    Querying GPUs infos
    查询GPU信息
    """
    qargs =['index','gpu_name', 'memory.free', 'memory.total', 'power.draw', 'power.limit']+ qargs
    cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs))
    results = os.popen(cmd).readlines()
    return [parse(line,qargs) for line in results]

In [8]:
query_gpu()

[{'gpu_name': 'Tesla K20c',
  'index': '0',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 51,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '1',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 49,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '2',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 50,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '3',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 51,
  'power.limit': 225}]

In [14]:
def by_power(d):
    '''
    helper function fo sorting gpus by power
    '''
    power_infos = (d['power.draw'],d['power.limit'])
    if any(v == 1 for v in power_infos):
        print('Power management unable for GPU {}'.format(d['index']))
        return 1
    return float(d['power.draw'])/d['power.limit']

In [15]:
def sorted_by_power(gpus):
    return sorted(gpus, key = by_power)

In [13]:
sorted_by_power(query_gpu())

[{'gpu_name': 'Tesla K20c',
  'index': '1',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 49,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '2',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 50,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '3',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 50,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '0',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 51,
  'power.limit': 225}]

In [16]:
def sorted_by_memory(gpus, by_size = False):
    """
    将 GPU 按照显存大小或空闲率排序
    """
    if by_size:
        print('Sorted by free memory size')
        return sorted(gpus, key = lambda d:d['memory.free'], reverse=True)
    else:
        print('Sorted by free memory rate')
        return sorted(gpus, key = lambda d:float(d['memory.free'])/ d['memory.total'], reverse=True)

In [18]:
sorted_by_memory(query_gpu())

Sorted by free memory rate


[{'gpu_name': 'Tesla K20c',
  'index': '0',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 51,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '1',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 49,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '2',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 50,
  'power.limit': 225},
 {'gpu_name': 'Tesla K20c',
  'index': '3',
  'memory.free': 4742,
  'memory.total': 4742,
  'power.draw': 50,
  'power.limit': 225}]

In [19]:
def sorted_by_custom(gpus, key, reverse=False, qargs=[]):
    if isinstance(key, str) and (key in qargs):
        return sorted(gpus, key=lambda d:d[key], reverse=reverse)
    if isinstance(key, type(lambda a:a)):
        return sorted(gpus, key=key, reverse=reverse)
    raise ValueError("The argument 'key' must be a function or a key in query args,please read the documention of nvidia-smi")

In [20]:
sorted_by_custom(query_gpu(), "power.draw")

ValueError: The argument 'key' must be a function or a key in query args,please read the documention of nvidia-smi

In [24]:
class GPUManager():
    '''
    qargs:
        query arguments
    A manager which can list all available GPU devices
    and sort them and choice the most free one.Unspecified 
    ones pref.
    GPU设备管理器，考虑列举出所有可用GPU设备，并加以排序，自动选出
    最空闲的设备。在一个GPUManager对象内会记录每个GPU是否已被指定，
    优先选择未指定的GPU。
    '''
    def __init__(self,qargs=[]):
        '''
        '''
        self.qargs=qargs
        self.gpus=query_gpu(qargs)
        for gpu in self.gpus:
            gpu['specified']=False
        self.gpu_num=len(self.gpus)
    def auto_choice(self, mode=0):
        '''
        mode:
            0:(default)sorted by free memory size
        return:
            a TF device object
        Auto choice the freest GPU device,not specified
        ones 
        自动选择最空闲GPU
        '''
        for old_infos,new_infos in zip(self.gpus,query_gpu(self.qargs)):
            old_infos.update(new_infos)
        unspecified_gpus=[gpu for gpu in self.gpus if not gpu['specified']] or self.gpus
        
        if mode == 0:
            print('Choosing the GPU device has largest free memory...')
            chosen_gpu = sorted_by_memory(unspecified_gpus, True)[0]
        elif mode ==1:
            print('Choosing the GPU device has highest free memory rate...')
            chosen_gpu=self._sort_by_power(unspecified_gpus)[0]
        elif mode == 2:
            print('Choosing the GPU device by power...')
            chosen_gpu = sorted_by_power(unspecified_gpus)[0]
        else:
            print('Given an unaviliable mode,will be chosen by memory')
            chosen_gpu = sorted_by_memory(unspecified_gpus)[0]
        chosen_gpu['specified'] = True
        index = chosen_gpu['index']
        print('Using GPU {i}:\n{info}'.format(i=index,info='\n'.join([str(k)+':'+str(v) for k,v in chosen_gpu.items()])))
        return tf.device('/gpu:{}'.format(index))
    def get_gpu_num(self):
        return self.gpu_num 

In [28]:
if check_gpus():
    gm = GPUManager()
    print(gm.auto_choice())
    print("\n")
    print(gm.auto_choice())
    print("\n")
    print(gm.auto_choice(2))
else:
    print("不存在GPU，将使用cpu")   

存在 GPU，总共有 4 块 GPU 卡
Choosing the GPU device has largest free memory...
Sorted by free memory size
Using GPU 0:
index:0
gpu_name:Tesla K20c
memory.free:4742
memory.total:4742
power.draw:51
power.limit:225
specified:True
<contextlib._GeneratorContextManager object at 0x7f60949c0588>


Choosing the GPU device has largest free memory...
Sorted by free memory size
Using GPU 1:
index:1
gpu_name:Tesla K20c
memory.free:4742
memory.total:4742
power.draw:49
power.limit:225
specified:True
<contextlib._GeneratorContextManager object at 0x7f60949c0710>


Choosing the GPU device by power...
Using GPU 2:
index:2
gpu_name:Tesla K20c
memory.free:4742
memory.total:4742
power.draw:50
power.limit:225
specified:True
<contextlib._GeneratorContextManager object at 0x7f60949c05f8>
