-
Notifications
You must be signed in to change notification settings - Fork 1
/
tgi.yaml
76 lines (74 loc) · 1.7 KB
/
tgi.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Pod running HuggingFace text-generation-inference serving tiiuae/falcon-40b-instruct,
# sharded across 2 GPUs. Equivalent to this container:
# docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:0.8.2 --model-id $model --num-shard $num_shard
---
apiVersion: v1
kind: Pod
metadata:
  name: text-generation-inference
  labels:
    # Matched by the Service selector below.
    run: text-generation-inference
spec:
  containers:
    - name: text-generation-inference
      image: ghcr.io/huggingface/text-generation-inference:0.8.2
      env:
        # Full Rust backtraces on launcher/server panics, for debugging.
        - name: RUST_BACKTRACE
          value: "1"
        # Uncomment to disable NCCL shared-memory transport if inter-shard
        # communication fails on this cluster.
        # - name: NCCL_SHM_DISABLE
        #   value: "1"
      # command: ["--model-id", "$model", "--num-shard", "$num_shard"]
      command:
        - "text-generation-launcher"
        - "--model-id"
        - "tiiuae/falcon-40b-instruct"
        - "--num-shard"
        - "2"
      # NOTE(review): no resources.limits for nvidia.com/gpu here — the docker
      # equivalent uses --gpus all. Confirm the gpunp nodepool exposes GPUs
      # without an explicit device-plugin request, or add the limit.
      ports:
        - containerPort: 80
          name: http
      volumeMounts:
        # Model weight cache (see PVC of the same name).
        - name: falcon-40b-instruct
          mountPath: /data
        # Shared memory for NCCL, replacing docker's --shm-size 1g.
        - name: shm
          mountPath: /dev/shm
  volumes:
    - name: falcon-40b-instruct
      persistentVolumeClaim:
        claimName: falcon-40b-instruct
    - name: shm
      emptyDir:
        medium: Memory
        sizeLimit: 1Gi
  # Pin to the GPU nodepool and tolerate its dedicated-GPU taint.
  nodeSelector:
    agentpool: gpunp
  tolerations:
    - key: sku
      operator: Equal
      value: gpu
      effect: NoSchedule
  restartPolicy: Never
---
# Persistent storage for the downloaded model weights, mounted at /data in the
# Pod above so weights survive Pod restarts and are not re-downloaded.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: falcon-40b-instruct
spec:
  accessModes:
    - ReadWriteOnce
  # AKS premium managed-disk storage class (matches the agentpool nodeSelector).
  storageClassName: managed-csi-premium
  resources:
    requests:
      storage: 500Gi
---
# Cluster-internal Service exposing the inference Pod's HTTP port 80.
apiVersion: v1
kind: Service
metadata:
  name: text-generation-inference
spec:
  ports:
    - port: 80
      protocol: TCP
      targetPort: 80
  selector:
    # Routes to Pods labeled run=text-generation-inference (the Pod above).
    run: text-generation-inference
  type: ClusterIP